Import 'regex' package version 1.3.6
* Add OWNERS
* No Android.bp yet
Bug: 152884384
Test: make
Change-Id: I455caf7833b6c437c1c133bc7b2f47b83da9cbce
diff --git a/src/literal/imp.rs b/src/literal/imp.rs
new file mode 100644
index 0000000..fe07ffc
--- /dev/null
+++ b/src/literal/imp.rs
@@ -0,0 +1,1121 @@
+use std::cmp;
+use std::mem;
+
+use aho_corasick::{self, packed, AhoCorasick, AhoCorasickBuilder};
+use memchr::{memchr, memchr2, memchr3};
+use syntax::hir::literal::{Literal, Literals};
+
+use freqs::BYTE_FREQUENCIES;
+
+/// A prefix extracted from a compiled regular expression.
+///
+/// A regex prefix is a set of literal strings that *must* be matched at the
+/// beginning of a regex in order for the entire regex to match. Similarly
+/// for a regex suffix.
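+///
+/// For example (illustrative only; literal extraction itself lives in
+/// `regex-syntax`): any match of the regex `samwise|sam` must begin with
+/// one of the prefix literals `samwise` or `sam`, so a prefix searcher
+/// can scan for those literals before running the full regex engine.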
+#[derive(Clone, Debug)]
+pub struct LiteralSearcher {
+ complete: bool,
+ lcp: FreqyPacked,
+ lcs: FreqyPacked,
+ matcher: Matcher,
+}
+
+#[derive(Clone, Debug)]
+enum Matcher {
+ /// No literals. (Never advances through the input.)
+ Empty,
+ /// A set of four or more single byte literals.
+ Bytes(SingleByteSet),
+ /// A single substring, found using memchr and frequency analysis.
+ FreqyPacked(FreqyPacked),
+ /// A single substring, found using Boyer-Moore.
+ BoyerMoore(BoyerMooreSearch),
+ /// An Aho-Corasick automaton.
+ AC { ac: AhoCorasick<u32>, lits: Vec<Literal> },
+ /// A packed multiple substring searcher, using SIMD.
+ ///
+ /// Note that Aho-Corasick will actually use this packed searcher
+ /// internally automatically; however, there is some overhead associated
+ /// with going through the Aho-Corasick machinery, so using the packed
+ /// searcher directly results in some gains.
+ Packed { s: packed::Searcher, lits: Vec<Literal> },
+}
+
+impl LiteralSearcher {
+ /// Returns a matcher that never matches and never advances the input.
+ pub fn empty() -> Self {
+ Self::new(Literals::empty(), Matcher::Empty)
+ }
+
+ /// Returns a matcher for literal prefixes from the given set.
+ pub fn prefixes(lits: Literals) -> Self {
+ let matcher = Matcher::prefixes(&lits);
+ Self::new(lits, matcher)
+ }
+
+ /// Returns a matcher for literal suffixes from the given set.
+ pub fn suffixes(lits: Literals) -> Self {
+ let matcher = Matcher::suffixes(&lits);
+ Self::new(lits, matcher)
+ }
+
+ fn new(lits: Literals, matcher: Matcher) -> Self {
+ let complete = lits.all_complete();
+ LiteralSearcher {
+ complete: complete,
+ lcp: FreqyPacked::new(lits.longest_common_prefix().to_vec()),
+ lcs: FreqyPacked::new(lits.longest_common_suffix().to_vec()),
+ matcher: matcher,
+ }
+ }
+
+ /// Returns true if all matches comprise the entire regular expression.
+ ///
+ /// This does not necessarily mean that a literal match implies a match
+ /// of the regular expression. For example, the regular expression `^a`
+ /// comprises a single complete literal `a`, but the regular
+ /// expression demands that it only match at the beginning of a string.
+ pub fn complete(&self) -> bool {
+ self.complete && !self.is_empty()
+ }
+
+ /// Find the position of a literal in `haystack` if it exists.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub fn find(&self, haystack: &[u8]) -> Option<(usize, usize)> {
+ use self::Matcher::*;
+ match self.matcher {
+ Empty => Some((0, 0)),
+ Bytes(ref sset) => sset.find(haystack).map(|i| (i, i + 1)),
+ FreqyPacked(ref s) => s.find(haystack).map(|i| (i, i + s.len())),
+ BoyerMoore(ref s) => s.find(haystack).map(|i| (i, i + s.len())),
+ AC { ref ac, .. } => {
+ ac.find(haystack).map(|m| (m.start(), m.end()))
+ }
+ Packed { ref s, .. } => {
+ s.find(haystack).map(|m| (m.start(), m.end()))
+ }
+ }
+ }
+
+ /// Like find, except matches must start at index `0`.
+ pub fn find_start(&self, haystack: &[u8]) -> Option<(usize, usize)> {
+ for lit in self.iter() {
+ if lit.len() > haystack.len() {
+ continue;
+ }
+ if lit == &haystack[0..lit.len()] {
+ return Some((0, lit.len()));
+ }
+ }
+ None
+ }
+
+ /// Like find, except matches must end at index `haystack.len()`.
+ pub fn find_end(&self, haystack: &[u8]) -> Option<(usize, usize)> {
+ for lit in self.iter() {
+ if lit.len() > haystack.len() {
+ continue;
+ }
+ if lit == &haystack[haystack.len() - lit.len()..] {
+ return Some((haystack.len() - lit.len(), haystack.len()));
+ }
+ }
+ None
+ }
+
+ /// Returns an iterator over all literals to be matched.
+ pub fn iter(&self) -> LiteralIter {
+ match self.matcher {
+ Matcher::Empty => LiteralIter::Empty,
+ Matcher::Bytes(ref sset) => LiteralIter::Bytes(&sset.dense),
+ Matcher::FreqyPacked(ref s) => LiteralIter::Single(&s.pat),
+ Matcher::BoyerMoore(ref s) => LiteralIter::Single(&s.pattern),
+ Matcher::AC { ref lits, .. } => LiteralIter::AC(lits),
+ Matcher::Packed { ref lits, .. } => LiteralIter::Packed(lits),
+ }
+ }
+
+ /// Returns a matcher for the longest common prefix of this matcher.
+ pub fn lcp(&self) -> &FreqyPacked {
+ &self.lcp
+ }
+
+ /// Returns a matcher for the longest common suffix of this matcher.
+ pub fn lcs(&self) -> &FreqyPacked {
+ &self.lcs
+ }
+
+ /// Returns true iff this prefix is empty.
+ pub fn is_empty(&self) -> bool {
+ self.len() == 0
+ }
+
+ /// Returns the number of prefixes in this machine.
+ pub fn len(&self) -> usize {
+ use self::Matcher::*;
+ match self.matcher {
+ Empty => 0,
+ Bytes(ref sset) => sset.dense.len(),
+ FreqyPacked(_) => 1,
+ BoyerMoore(_) => 1,
+ AC { ref ac, .. } => ac.pattern_count(),
+ Packed { ref lits, .. } => lits.len(),
+ }
+ }
+
+ /// Return the approximate heap usage of literals in bytes.
+ pub fn approximate_size(&self) -> usize {
+ use self::Matcher::*;
+ match self.matcher {
+ Empty => 0,
+ Bytes(ref sset) => sset.approximate_size(),
+ FreqyPacked(ref single) => single.approximate_size(),
+ BoyerMoore(ref single) => single.approximate_size(),
+ AC { ref ac, .. } => ac.heap_bytes(),
+ Packed { ref s, .. } => s.heap_bytes(),
+ }
+ }
+}
+
+impl Matcher {
+ fn prefixes(lits: &Literals) -> Self {
+ let sset = SingleByteSet::prefixes(lits);
+ Matcher::new(lits, sset)
+ }
+
+ fn suffixes(lits: &Literals) -> Self {
+ let sset = SingleByteSet::suffixes(lits);
+ Matcher::new(lits, sset)
+ }
+
+ fn new(lits: &Literals, sset: SingleByteSet) -> Self {
+ if lits.literals().is_empty() {
+ return Matcher::Empty;
+ }
+ if sset.dense.len() >= 26 {
+ // Avoid trying to match a large number of single bytes.
+ // This is *very* sensitive to a frequency analysis comparison
+ // between the bytes in sset and the composition of the haystack.
+ // No matter the size of sset, if its members all are rare in the
+ // haystack, then it'd be worth using it. How to tune this... IDK.
+ // ---AG
+ return Matcher::Empty;
+ }
+ if sset.complete {
+ return Matcher::Bytes(sset);
+ }
+ if lits.literals().len() == 1 {
+ let lit = lits.literals()[0].to_vec();
+ if BoyerMooreSearch::should_use(lit.as_slice()) {
+ return Matcher::BoyerMoore(BoyerMooreSearch::new(lit));
+ } else {
+ return Matcher::FreqyPacked(FreqyPacked::new(lit));
+ }
+ }
+
+ let pats = lits.literals().to_owned();
+ let is_aho_corasick_fast = sset.dense.len() <= 1 && sset.all_ascii;
+ if lits.literals().len() <= 100 && !is_aho_corasick_fast {
+ let mut builder = packed::Config::new()
+ .match_kind(packed::MatchKind::LeftmostFirst)
+ .builder();
+ if let Some(s) = builder.extend(&pats).build() {
+ return Matcher::Packed { s, lits: pats };
+ }
+ }
+ let ac = AhoCorasickBuilder::new()
+ .match_kind(aho_corasick::MatchKind::LeftmostFirst)
+ .dfa(true)
+ .build_with_size::<u32, _, _>(&pats)
+ .unwrap();
+ Matcher::AC { ac, lits: pats }
+ }
+}
+
+pub enum LiteralIter<'a> {
+ Empty,
+ Bytes(&'a [u8]),
+ Single(&'a [u8]),
+ AC(&'a [Literal]),
+ Packed(&'a [Literal]),
+}
+
+impl<'a> Iterator for LiteralIter<'a> {
+ type Item = &'a [u8];
+
+ fn next(&mut self) -> Option<Self::Item> {
+ match *self {
+ LiteralIter::Empty => None,
+ LiteralIter::Bytes(ref mut many) => {
+ if many.is_empty() {
+ None
+ } else {
+ let next = &many[0..1];
+ *many = &many[1..];
+ Some(next)
+ }
+ }
+ LiteralIter::Single(ref mut one) => {
+ if one.is_empty() {
+ None
+ } else {
+ let next = &one[..];
+ *one = &[];
+ Some(next)
+ }
+ }
+ LiteralIter::AC(ref mut lits) => {
+ if lits.is_empty() {
+ None
+ } else {
+ let next = &lits[0];
+ *lits = &lits[1..];
+ Some(&**next)
+ }
+ }
+ LiteralIter::Packed(ref mut lits) => {
+ if lits.is_empty() {
+ None
+ } else {
+ let next = &lits[0];
+ *lits = &lits[1..];
+ Some(&**next)
+ }
+ }
+ }
+ }
+}
+
+#[derive(Clone, Debug)]
+struct SingleByteSet {
+ sparse: Vec<bool>,
+ dense: Vec<u8>,
+ complete: bool,
+ all_ascii: bool,
+}
+
+impl SingleByteSet {
+ fn new() -> SingleByteSet {
+ SingleByteSet {
+ sparse: vec![false; 256],
+ dense: vec![],
+ complete: true,
+ all_ascii: true,
+ }
+ }
+
+ fn prefixes(lits: &Literals) -> SingleByteSet {
+ let mut sset = SingleByteSet::new();
+ for lit in lits.literals() {
+ sset.complete = sset.complete && lit.len() == 1;
+ if let Some(&b) = lit.get(0) {
+ if !sset.sparse[b as usize] {
+ if b > 0x7F {
+ sset.all_ascii = false;
+ }
+ sset.dense.push(b);
+ sset.sparse[b as usize] = true;
+ }
+ }
+ }
+ sset
+ }
+
+ fn suffixes(lits: &Literals) -> SingleByteSet {
+ let mut sset = SingleByteSet::new();
+ for lit in lits.literals() {
+ sset.complete = sset.complete && lit.len() == 1;
+ if let Some(&b) = lit.get(lit.len().checked_sub(1).unwrap()) {
+ if !sset.sparse[b as usize] {
+ if b > 0x7F {
+ sset.all_ascii = false;
+ }
+ sset.dense.push(b);
+ sset.sparse[b as usize] = true;
+ }
+ }
+ }
+ sset
+ }
+
+ /// Faster find that special cases certain sizes to use memchr.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn find(&self, text: &[u8]) -> Option<usize> {
+ match self.dense.len() {
+ 0 => None,
+ 1 => memchr(self.dense[0], text),
+ 2 => memchr2(self.dense[0], self.dense[1], text),
+ 3 => memchr3(self.dense[0], self.dense[1], self.dense[2], text),
+ _ => self._find(text),
+ }
+ }
+
+ /// Generic find that works on any sized set.
+ fn _find(&self, haystack: &[u8]) -> Option<usize> {
+ for (i, &b) in haystack.iter().enumerate() {
+ if self.sparse[b as usize] {
+ return Some(i);
+ }
+ }
+ None
+ }
+
+ fn approximate_size(&self) -> usize {
+ (self.dense.len() * mem::size_of::<u8>())
+ + (self.sparse.len() * mem::size_of::<bool>())
+ }
+}
+
+/// Provides an implementation of fast substring search using frequency
+/// analysis.
+///
+/// memchr is so fast that we do everything we can to keep the loop in memchr
+/// for as long as possible. The easiest way to do this is to intelligently
+/// pick the byte to send to memchr. The best byte is the byte that occurs
+/// least frequently in the haystack. Since doing frequency analysis on the
+/// haystack is far too expensive, we compute a set of fixed frequencies up
+/// front and hard code them in src/freqs.rs. Frequency analysis is done via
+/// scripts/frequencies.py.
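+///
+/// As a hedged illustration: for a pattern like `"xyz"`, if `z` has the
+/// lowest precomputed frequency rank, memchr scans for `z`, and each hit
+/// is confirmed by first checking the second-rarest byte (say `x`) at its
+/// expected offset before paying for a full pattern comparison. Which
+/// bytes are chosen depends entirely on the BYTE_FREQUENCIES table.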
+#[derive(Clone, Debug)]
+pub struct FreqyPacked {
+ /// The pattern.
+ pat: Vec<u8>,
+ /// The number of Unicode characters in the pattern. This is useful for
+ /// determining the effective length of a pattern when deciding which
+ /// optimizations to perform. A trailing incomplete UTF-8 sequence counts
+ /// as one character.
+ char_len: usize,
+ /// The rarest byte in the pattern, according to pre-computed frequency
+ /// analysis.
+ rare1: u8,
+ /// The offset of the rarest byte in `pat`.
+ rare1i: usize,
+ /// The second rarest byte in the pattern, according to pre-computed
+ /// frequency analysis. (This may be equivalent to the rarest byte.)
+ ///
+ /// The second rarest byte is used as a type of guard for quickly detecting
+ /// a mismatch after memchr locates an instance of the rarest byte. This
+ /// is a hedge against pathological cases where the pre-computed frequency
+ /// analysis may be off. (But of course, does not prevent *all*
+ /// pathological cases.)
+ rare2: u8,
+ /// The offset of the second rarest byte in `pat`.
+ rare2i: usize,
+}
+
+impl FreqyPacked {
+ fn new(pat: Vec<u8>) -> FreqyPacked {
+ if pat.is_empty() {
+ return FreqyPacked::empty();
+ }
+
+ // Find the rarest two bytes. Try to make them distinct (but it's not
+ // required).
+ let mut rare1 = pat[0];
+ let mut rare2 = pat[0];
+ for b in pat[1..].iter().cloned() {
+ if freq_rank(b) < freq_rank(rare1) {
+ rare1 = b;
+ }
+ }
+ for &b in &pat {
+ if rare1 == rare2 {
+ rare2 = b
+ } else if b != rare1 && freq_rank(b) < freq_rank(rare2) {
+ rare2 = b;
+ }
+ }
+
+ // And find the offsets of their last occurrences.
+ let rare1i = pat.iter().rposition(|&b| b == rare1).unwrap();
+ let rare2i = pat.iter().rposition(|&b| b == rare2).unwrap();
+
+ let char_len = char_len_lossy(&pat);
+ FreqyPacked {
+ pat: pat,
+ char_len: char_len,
+ rare1: rare1,
+ rare1i: rare1i,
+ rare2: rare2,
+ rare2i: rare2i,
+ }
+ }
+
+ fn empty() -> FreqyPacked {
+ FreqyPacked {
+ pat: vec![],
+ char_len: 0,
+ rare1: 0,
+ rare1i: 0,
+ rare2: 0,
+ rare2i: 0,
+ }
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub fn find(&self, haystack: &[u8]) -> Option<usize> {
+ let pat = &*self.pat;
+ if haystack.len() < pat.len() || pat.is_empty() {
+ return None;
+ }
+ let mut i = self.rare1i;
+ while i < haystack.len() {
+ i += match memchr(self.rare1, &haystack[i..]) {
+ None => return None,
+ Some(i) => i,
+ };
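+ // Align the candidate window so that the rare byte we just
+ // found lines up with its last occurrence in the pattern.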
+ let start = i - self.rare1i;
+ let end = start + pat.len();
+ if end > haystack.len() {
+ return None;
+ }
+ let aligned = &haystack[start..end];
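+ // Cheap guard: check the second rarest byte before paying for
+ // a full pattern comparison.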
+ if aligned[self.rare2i] == self.rare2 && aligned == &*self.pat {
+ return Some(start);
+ }
+ i += 1;
+ }
+ None
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub fn is_suffix(&self, text: &[u8]) -> bool {
+ if text.len() < self.len() {
+ return false;
+ }
+ text[text.len() - self.len()..] == *self.pat
+ }
+
+ pub fn len(&self) -> usize {
+ self.pat.len()
+ }
+
+ pub fn char_len(&self) -> usize {
+ self.char_len
+ }
+
+ fn approximate_size(&self) -> usize {
+ self.pat.len() * mem::size_of::<u8>()
+ }
+}
+
+fn char_len_lossy(bytes: &[u8]) -> usize {
+ String::from_utf8_lossy(bytes).chars().count()
+}
+
+/// An implementation of Tuned Boyer-Moore as laid out by
+/// Andrew Hume and Daniel Sunday in "Fast String Searching".
+/// O(n) in the size of the input.
+///
+/// Fast string searching algorithms come in many variations,
+/// but they can generally be described in terms of three main
+/// components.
+///
+/// The skip loop is where the string searcher wants to spend
+/// as much time as possible. Exactly which character in the
+/// pattern the skip loop examines varies from algorithm to
+/// algorithm, but in the simplest case this loop repeatedly
+/// looks at the last character in the pattern and jumps
+/// forward in the input if it is not in the pattern.
+/// Robert Boyer and J Moore called this the "fast" loop in
+/// their original paper.
+///
+/// The match loop is responsible for actually examining the
+/// whole potentially matching substring. In order to fail
+/// faster, the match loop sometimes has a guard test attached.
+/// The guard test uses frequency analysis of the different
+/// characters in the pattern to choose the least frequently
+/// occurring character and uses it to find match failures
+/// as quickly as possible.
+///
+/// The shift rule governs how the algorithm will shuffle its
+/// test window in the event of a failure during the match loop.
+/// Certain shift rules allow the worst-case run time of the
+/// algorithm to be shown to be O(n) in the size of the input
+/// rather than O(nm) in the size of the input and the size
+/// of the pattern (as naive Boyer-Moore is).
+///
+/// "Fast String Searching", in addition to presenting a tuned
+/// algorithm, provides a comprehensive taxonomy of the many
+/// different flavors of string searchers. Under that taxonomy
+/// TBM, the algorithm implemented here, uses an unrolled fast
+/// skip loop with memchr fallback, a forward match loop with guard,
+/// and Sunday's mini delta shift rule. To unpack all of that, you'll
+/// have to read the paper.
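+///
+/// As a rough illustration of the skip loop (not tied to any particular
+/// frequency table): when searching for `needle`, the searcher inspects
+/// the haystack byte aligned with the last pattern byte (`e`). If that
+/// byte does not occur in the pattern at all, the window can safely jump
+/// forward by the full pattern length; otherwise it shifts just far
+/// enough to line up the rightmost occurrence of that byte in the
+/// pattern.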
+#[derive(Clone, Debug)]
+pub struct BoyerMooreSearch {
+ /// The pattern we are going to look for in the haystack.
+ pattern: Vec<u8>,
+
+ /// The skip table for the skip loop.
+ ///
+ /// Maps the character at the end of the input
+ /// to a shift.
+ skip_table: Vec<usize>,
+
+ /// The guard character (least frequently occurring char).
+ guard: u8,
+ /// The reverse-index of the guard character in the pattern.
+ guard_reverse_idx: usize,
+
+ /// Daniel Sunday's mini generalized delta2 shift table.
+ ///
+ /// We use a skip loop, so we only have to provide a shift
+ /// for the skip char (last char). This is why it is a mini
+ /// shift rule.
+ md2_shift: usize,
+}
+
+impl BoyerMooreSearch {
+ /// Create a new string searcher, performing whatever
+ /// compilation steps are required.
+ fn new(pattern: Vec<u8>) -> Self {
+ debug_assert!(!pattern.is_empty());
+
+ let (g, gi) = Self::select_guard(pattern.as_slice());
+ let skip_table = Self::compile_skip_table(pattern.as_slice());
+ let md2_shift = Self::compile_md2_shift(pattern.as_slice());
+ BoyerMooreSearch {
+ pattern: pattern,
+ skip_table: skip_table,
+ guard: g,
+ guard_reverse_idx: gi,
+ md2_shift: md2_shift,
+ }
+ }
+
+ /// Find the pattern in `haystack`, returning the offset
+ /// of the start of the first occurrence of the pattern
+ /// in `haystack`.
+ #[inline]
+ fn find(&self, haystack: &[u8]) -> Option<usize> {
+ if haystack.len() < self.pattern.len() {
+ return None;
+ }
+
+ let mut window_end = self.pattern.len() - 1;
+
+ // Inspired by the grep source. It is a way
+ // to do correct loop unrolling without having to place
+ // a crashpad of terminating characters at the end in
+ // the way described in the Fast String Searching paper.
+ const NUM_UNROLL: usize = 10;
+ // 1 for the initial position, and 1 for the md2 shift
+ let short_circuit = (NUM_UNROLL + 2) * self.pattern.len();
+
+ if haystack.len() > short_circuit {
+ // just 1 for the md2 shift
+ let backstop =
+ haystack.len() - ((NUM_UNROLL + 1) * self.pattern.len());
+ loop {
+ window_end =
+ match self.skip_loop(haystack, window_end, backstop) {
+ Some(i) => i,
+ None => return None,
+ };
+ if window_end >= backstop {
+ break;
+ }
+
+ if self.check_match(haystack, window_end) {
+ return Some(window_end - (self.pattern.len() - 1));
+ } else {
+ let skip = self.skip_table[haystack[window_end] as usize];
+ window_end +=
+ if skip == 0 { self.md2_shift } else { skip };
+ continue;
+ }
+ }
+ }
+
+ // now process the input after the backstop
+ while window_end < haystack.len() {
+ let mut skip = self.skip_table[haystack[window_end] as usize];
+ if skip == 0 {
+ if self.check_match(haystack, window_end) {
+ return Some(window_end - (self.pattern.len() - 1));
+ } else {
+ skip = self.md2_shift;
+ }
+ }
+ window_end += skip;
+ }
+
+ None
+ }
+
+ fn len(&self) -> usize {
+ return self.pattern.len();
+ }
+
+ /// The key heuristic that decides whether BoyerMooreSearch gets used.
+ ///
+ /// See `rust-lang/regex/issues/408`.
+ ///
+ /// Tuned Boyer-Moore is actually pretty slow! It turns out a
+ /// hand-rolled platform-specific memchr routine with a bit of frequency
+ /// analysis sprinkled on top actually wins most of the time.
+ /// However, there are a few cases where Tuned Boyer-Moore still
+ /// wins.
+ ///
+ /// If the haystack is random, frequency analysis doesn't help us,
+ /// so Boyer-Moore will win for sufficiently large needles.
+ /// Unfortunately, there is no obvious way to determine this
+ /// ahead of time.
+ ///
+ /// If the pattern itself consists of very common characters,
+ /// frequency analysis won't get us anywhere. The most extreme
+ /// example of this is a pattern like `eeeeeeeeeeeeeeee`. Fortunately,
+ /// this case is wholly determined by the pattern, so we can actually
+ /// implement the heuristic.
+ ///
+ /// A third case is if the pattern is sufficiently long. The idea
+ /// here is that once the pattern gets long enough the Tuned
+ /// Boyer-Moore skip loop will start making strides long enough
+ /// to beat the asm deep magic that is memchr.
+ fn should_use(pattern: &[u8]) -> bool {
+ // The pattern must be strictly longer than this to use TBM.
+ const MIN_LEN: usize = 9;
+ // The minimum frequency rank (lower is rarer) that every byte in the
+ // pattern must have in order to use TBM. That is, if the pattern
+ // contains _any_ byte with a lower rank, then TBM won't be used.
+ const MIN_CUTOFF: usize = 150;
+ // The maximum frequency rank for any byte.
+ const MAX_CUTOFF: usize = 255;
+ // The scaling factor used to determine the actual cutoff frequency
+ // to use (keeping in mind that the minimum frequency rank is bounded
+ // by MIN_CUTOFF). This scaling factor is an attempt to make TBM more
+ // likely to be used as the pattern grows longer. That is, longer
+ // patterns permit somewhat less frequent bytes than shorter patterns,
+ // under the assumption that TBM gets better as the pattern gets
+ // longer.
+ const LEN_CUTOFF_PROPORTION: usize = 4;
+
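+ // A worked example of the cutoff arithmetic (illustrative numbers):
+ // for a 16 byte pattern, scaled_rank = 16 * 4 = 64 and the cutoff is
+ // max(150, 255 - 64) = 191, so every byte must have a frequency rank
+ // of at least 191. For a 30 byte pattern, scaled_rank = 120 and the
+ // cutoff bottoms out at MIN_CUTOFF = 150, admitting somewhat more
+ // common bytes.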
+ let scaled_rank = pattern.len().wrapping_mul(LEN_CUTOFF_PROPORTION);
+ let cutoff = cmp::max(
+ MIN_CUTOFF,
+ MAX_CUTOFF - cmp::min(MAX_CUTOFF, scaled_rank),
+ );
+ // The pattern must be long enough to be worthwhile. e.g., memchr will
+ // be faster on a pattern like `e` because it is short, even though `e`
+ // is quite common.
+ pattern.len() > MIN_LEN
+ // all the bytes must be more common than the cutoff.
+ && pattern.iter().all(|c| freq_rank(*c) >= cutoff)
+ }
+
+ /// Check to see if there is a match at the given position
+ #[inline]
+ fn check_match(&self, haystack: &[u8], window_end: usize) -> bool {
+ // guard test
+ if haystack[window_end - self.guard_reverse_idx] != self.guard {
+ return false;
+ }
+
+ // match loop
+ let window_start = window_end - (self.pattern.len() - 1);
+ for i in 0..self.pattern.len() {
+ if self.pattern[i] != haystack[window_start + i] {
+ return false;
+ }
+ }
+
+ true
+ }
+
+ /// Skip forward according to the shift table.
+ ///
+ /// Returns the offset of the next occurrence
+ /// of the last char in the pattern, or `None`
+ /// if it never reappears. If `skip_loop` hits the backstop
+ /// it will leave early.
+ #[inline]
+ fn skip_loop(
+ &self,
+ haystack: &[u8],
+ mut window_end: usize,
+ backstop: usize,
+ ) -> Option<usize> {
+ let window_end_snapshot = window_end;
+ let skip_of = |we: usize| -> usize {
+ // Unsafe might make this faster, but the benchmarks
+ // were hard to interpret.
+ self.skip_table[haystack[we] as usize]
+ };
+
+ loop {
+ let mut skip = skip_of(window_end);
+ window_end += skip;
+ skip = skip_of(window_end);
+ window_end += skip;
+ if skip != 0 {
+ skip = skip_of(window_end);
+ window_end += skip;
+ skip = skip_of(window_end);
+ window_end += skip;
+ skip = skip_of(window_end);
+ window_end += skip;
+ if skip != 0 {
+ skip = skip_of(window_end);
+ window_end += skip;
+ skip = skip_of(window_end);
+ window_end += skip;
+ skip = skip_of(window_end);
+ window_end += skip;
+ if skip != 0 {
+ skip = skip_of(window_end);
+ window_end += skip;
+ skip = skip_of(window_end);
+ window_end += skip;
+
+ // If ten iterations did not make at least 16 words
+ // worth of progress, we just fall back on memchr.
+ if window_end - window_end_snapshot
+ > 16 * mem::size_of::<usize>()
+ {
+ // Returning a window_end >= backstop will
+ // immediately break us out of the inner loop in
+ // `find`.
+ if window_end >= backstop {
+ return Some(window_end);
+ }
+
+ continue; // we made enough progress
+ } else {
+ // In case we are already there, and so that
+ // we will catch the guard char.
+ window_end = window_end
+ .checked_sub(1 + self.guard_reverse_idx)
+ .unwrap_or(0);
+
+ match memchr(self.guard, &haystack[window_end..]) {
+ None => return None,
+ Some(g_idx) => {
+ return Some(
+ window_end
+ + g_idx
+ + self.guard_reverse_idx,
+ );
+ }
+ }
+ }
+ }
+ }
+ }
+
+ return Some(window_end);
+ }
+ }
+
+ /// Compute the ufast (unrolled fast) skip table.
+ fn compile_skip_table(pattern: &[u8]) -> Vec<usize> {
+ let mut tab = vec![pattern.len(); 256];
+
+ // For every char in the pattern, we write a skip
+ // that will line us up with the rightmost occurrence.
+ //
+ // N.B. the sentinel (0) is written by the last
+ // loop iteration.
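+ //
+ // For example, for the pattern `abcab` (purely illustrative):
+ // tab[b'a'] = 1, tab[b'b'] = 0, tab[b'c'] = 2, and every other
+ // byte keeps the default skip of 5 (the pattern length).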
+ for (i, c) in pattern.iter().enumerate() {
+ tab[*c as usize] = (pattern.len() - 1) - i;
+ }
+
+ tab
+ }
+
+ /// Select the guard character based on the precomputed
+ /// frequency table.
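+ ///
+ /// For example, assuming `z` ranks rarer than `e` in BYTE_FREQUENCIES,
+ /// the guard for the pattern `zee` is `z` with reverse index 2.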
+ fn select_guard(pattern: &[u8]) -> (u8, usize) {
+ let mut rarest = pattern[0];
+ let mut rarest_rev_idx = pattern.len() - 1;
+ for (i, c) in pattern.iter().enumerate() {
+ if freq_rank(*c) < freq_rank(rarest) {
+ rarest = *c;
+ rarest_rev_idx = (pattern.len() - 1) - i;
+ }
+ }
+
+ (rarest, rarest_rev_idx)
+ }
+
+ /// If there is another occurrence of the skip
+ /// char, shift to it, otherwise just shift to
+ /// the next window.
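+ ///
+ /// For example (illustrative), for the pattern `abab` the skip char
+ /// is `b` and its previous occurrence is at index 1, so the shift is
+ /// (4 - 1) - 1 = 2.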
+ fn compile_md2_shift(pattern: &[u8]) -> usize {
+ let shiftc = *pattern.last().unwrap();
+
+ // For a pattern of length 1 we will never apply the
+ // shift rule, so we use a poison value on the principle
+ // that failing fast is a good thing.
+ if pattern.len() == 1 {
+ return 0xDEADBEEF;
+ }
+
+ let mut i = pattern.len() - 2;
+ while i > 0 {
+ if pattern[i] == shiftc {
+ return (pattern.len() - 1) - i;
+ }
+ i -= 1;
+ }
+
+ // The skip char never re-occurs in the pattern, so
+ // we can just shift the whole window length.
+ pattern.len() - 1
+ }
+
+ fn approximate_size(&self) -> usize {
+ (self.pattern.len() * mem::size_of::<u8>())
+ + (256 * mem::size_of::<usize>()) // skip table
+ }
+}
+
+fn freq_rank(b: u8) -> usize {
+ BYTE_FREQUENCIES[b as usize] as usize
+}
+
+#[cfg(test)]
+mod tests {
+ use super::{BoyerMooreSearch, FreqyPacked};
+
+ //
+ // Unit Tests
+ //
+
+ // The "hello, world" of string searching
+ #[test]
+ fn bm_find_subs() {
+ let searcher = BoyerMooreSearch::new(Vec::from(&b"pattern"[..]));
+ let haystack = b"I keep seeing patterns in this text";
+ assert_eq!(14, searcher.find(haystack).unwrap());
+ }
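+
+ // A FreqyPacked counterpart to bm_find_subs, added here as an
+ // illustrative example of the frequency-guided searcher; it is not
+ // part of the upstream test suite.
+ #[test]
+ fn freqy_find_subs() {
+ let searcher = FreqyPacked::new(Vec::from(&b"pattern"[..]));
+ let haystack = b"I keep seeing patterns in this text";
+ assert_eq!(Some(14), searcher.find(haystack));
+ }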
+
+ #[test]
+ fn bm_find_no_subs() {
+ let searcher = BoyerMooreSearch::new(Vec::from(&b"pattern"[..]));
+ let haystack = b"I keep seeing needles in this text";
+ assert_eq!(None, searcher.find(haystack));
+ }
+
+ //
+ // Regression Tests
+ //
+
+ #[test]
+ fn bm_skip_reset_bug() {
+ let haystack = vec![0, 0, 0, 0, 0, 1, 1, 0];
+ let needle = vec![0, 1, 1, 0];
+
+ let searcher = BoyerMooreSearch::new(needle);
+ let offset = searcher.find(haystack.as_slice()).unwrap();
+ assert_eq!(4, offset);
+ }
+
+ #[test]
+ fn bm_backstop_underflow_bug() {
+ let haystack = vec![0, 0];
+ let needle = vec![0, 0];
+
+ let searcher = BoyerMooreSearch::new(needle);
+ let offset = searcher.find(haystack.as_slice()).unwrap();
+ assert_eq!(0, offset);
+ }
+
+ #[test]
+ fn bm_naive_off_by_one_bug() {
+ let haystack = vec![91];
+ let needle = vec![91];
+
+ let naive_offset = naive_find(&needle, &haystack).unwrap();
+ assert_eq!(0, naive_offset);
+ }
+
+ #[test]
+ fn bm_memchr_fallback_indexing_bug() {
+ let mut haystack = vec![
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 87, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ ];
+ let needle = vec![1, 1, 1, 1, 32, 32, 87];
+ let needle_start = haystack.len();
+ haystack.extend(needle.clone());
+
+ let searcher = BoyerMooreSearch::new(needle);
+ assert_eq!(needle_start, searcher.find(haystack.as_slice()).unwrap());
+ }
+
+ #[test]
+ fn bm_backstop_boundary() {
+ let haystack = b"\
+// aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
+e_data.clone_created(entity_id, entity_to_add.entity_id);
+aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
+aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
+"
+ .to_vec();
+ let needle = b"clone_created".to_vec();
+
+ let searcher = BoyerMooreSearch::new(needle);
+ let result = searcher.find(&haystack);
+ assert_eq!(Some(43), result);
+ }
+
+ #[test]
+ fn bm_win_gnu_indexing_bug() {
+ let haystack_raw = vec![
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ ];
+ let needle = vec![1, 1, 1, 1, 1, 1, 1];
+ let haystack = haystack_raw.as_slice();
+
+ BoyerMooreSearch::new(needle.clone()).find(haystack);
+ }
+
+ //
+ // QuickCheck Properties
+ //
+
+ use quickcheck::TestResult;
+
+ fn naive_find(needle: &[u8], haystack: &[u8]) -> Option<usize> {
+ assert!(needle.len() <= haystack.len());
+
+ for i in 0..(haystack.len() - (needle.len() - 1)) {
+ if haystack[i] == needle[0]
+ && &haystack[i..(i + needle.len())] == needle
+ {
+ return Some(i);
+ }
+ }
+
+ None
+ }
+
+ quickcheck! {
+ fn qc_bm_equals_naive_find(pile1: Vec<u8>, pile2: Vec<u8>) -> TestResult {
+ if pile1.len() == 0 || pile2.len() == 0 {
+ return TestResult::discard();
+ }
+
+ let (needle, haystack) = if pile1.len() < pile2.len() {
+ (pile1, pile2.as_slice())
+ } else {
+ (pile2, pile1.as_slice())
+ };
+
+ let searcher = BoyerMooreSearch::new(needle.clone());
+ TestResult::from_bool(
+ searcher.find(haystack) == naive_find(&needle, haystack))
+ }
+
+ fn qc_bm_equals_single(pile1: Vec<u8>, pile2: Vec<u8>) -> TestResult {
+ if pile1.len() == 0 || pile2.len() == 0 {
+ return TestResult::discard();
+ }
+
+ let (needle, haystack) = if pile1.len() < pile2.len() {
+ (pile1, pile2.as_slice())
+ } else {
+ (pile2, pile1.as_slice())
+ };
+
+ let bm_searcher = BoyerMooreSearch::new(needle.clone());
+ let freqy_memchr = FreqyPacked::new(needle);
+ TestResult::from_bool(
+ bm_searcher.find(haystack) == freqy_memchr.find(haystack))
+ }
+
+ fn qc_bm_finds_trailing_needle(
+ haystack_pre: Vec<u8>,
+ needle: Vec<u8>
+ ) -> TestResult {
+ if needle.len() == 0 {
+ return TestResult::discard();
+ }
+
+ let mut haystack = haystack_pre.clone();
+ let searcher = BoyerMooreSearch::new(needle.clone());
+
+ if haystack.len() >= needle.len() &&
+ searcher.find(haystack.as_slice()).is_some() {
+ return TestResult::discard();
+ }
+
+ haystack.extend(needle.clone());
+
+ // What if the tail of the haystack can start the
+ // needle?
+ let start = haystack_pre.len()
+ .checked_sub(needle.len())
+ .unwrap_or(0);
+ for i in 0..(needle.len() - 1) {
+ if searcher.find(&haystack[(i + start)..]).is_some() {
+ return TestResult::discard();
+ }
+ }
+
+ TestResult::from_bool(
+ searcher.find(haystack.as_slice())
+ .map(|x| x == haystack_pre.len())
+ .unwrap_or(false))
+ }
+
+ // qc_equals_* is only testing the negative case as @burntsushi
+ // pointed out in https://github.com/rust-lang/regex/issues/446.
+ // This quickcheck prop represents an effort to force testing of
+ // the positive case. qc_bm_finds_first and qc_bm_finds_trailing_needle
+ // already check some of the positive cases, but they don't cover
+ // cases where the needle is in the middle of haystack. This prop
+ // fills that hole.
+ fn qc_bm_finds_subslice(
+ haystack: Vec<u8>,
+ needle_start: usize,
+ needle_length: usize
+ ) -> TestResult {
+ if haystack.len() == 0 {
+ return TestResult::discard();
+ }
+
+ let needle_start = needle_start % haystack.len();
+ let needle_length = needle_length % (haystack.len() - needle_start);
+
+ if needle_length == 0 {
+ return TestResult::discard();
+ }
+
+ let needle = &haystack[needle_start..(needle_start + needle_length)];
+
+ let bm_searcher = BoyerMooreSearch::new(needle.to_vec());
+
+ let start = naive_find(&needle, &haystack);
+ match start {
+ None => TestResult::from_bool(false),
+ Some(nf_start) =>
+ TestResult::from_bool(
+ nf_start <= needle_start
+ && bm_searcher.find(&haystack) == start
+ )
+ }
+ }
+
+ fn qc_bm_finds_first(needle: Vec<u8>) -> TestResult {
+ if needle.len() == 0 {
+ return TestResult::discard();
+ }
+
+ let mut haystack = needle.clone();
+ let searcher = BoyerMooreSearch::new(needle.clone());
+ haystack.extend(needle);
+
+ TestResult::from_bool(
+ searcher.find(haystack.as_slice())
+ .map(|x| x == 0)
+ .unwrap_or(false))
+ }
+ }
+}