Upgrade rust/crates/regex to 1.4.1 Test: make Change-Id: Ie12506e8c765f9dd8043cd61b0c77bebd887302e

commit: 849e4457e7a77114ede925233df78a9dc6ad665b [log] [tgz]
author: Chih-Hung Hsieh <chh@google.com> Mon Oct 26 13:16:47 2020 -0700
committer: Chih-Hung Hsieh <chh@google.com> Mon Oct 26 13:16:47 2020 -0700
tree: e873b7eb9bb377a84845126bfbbbc5301481d3bc
parent: 01a9012e518e5502dc54ae36ed78610a492a4737 [diff]
diff --git a/src/compile.rs b/src/compile.rs
index ad54040..cdc583c 100644
--- a/src/compile.rs
+++ b/src/compile.rs

@@ -222,7 +222,7 @@
     ///                                         hole
     /// ```
     ///
-    /// To compile two expressions, e1 and e2, concatinated together we
+    /// To compile two expressions, e1 and e2, concatenated together we
     /// would do:
     ///
     /// ```ignore

diff --git a/src/dfa.rs b/src/dfa.rs
index decc3b9..2a365ee 100644
--- a/src/dfa.rs
+++ b/src/dfa.rs

@@ -679,7 +679,7 @@
                 }
             } else if next_si & STATE_START > 0 {
                 // A start state isn't in the common case because we may
-                // what to do quick prefix scanning. If the program doesn't
+                // want to do quick prefix scanning. If the program doesn't
                 // have a detected prefix, then start states are actually
                 // considered common and this case is never reached.
                 debug_assert!(self.has_prefix());
@@ -725,7 +725,7 @@
             }
         }
 
-        // Run the DFA once more on the special EOF senitnel value.
+        // Run the DFA once more on the special EOF sentinel value.
         // We don't care about the special bits in the state pointer any more,
         // so get rid of them.
         prev_si &= STATE_MAX;
@@ -830,7 +830,7 @@
             }
         }
 
-        // Run the DFA once more on the special EOF senitnel value.
+        // Run the DFA once more on the special EOF sentinel value.
         prev_si = match self.next_state(qcur, qnext, prev_si, Byte::eof()) {
             None => return Result::Quit,
             Some(STATE_DEAD) => return result.set_non_match(0),
@@ -913,8 +913,8 @@
         if self.state(si).flags().has_empty() {
             // Compute the flags immediately preceding the current byte.
             // This means we only care about the "end" or "end line" flags.
-            // (The "start" flags are computed immediately proceding the
-            // current byte and is handled below.)
+            // (The "start" flags are computed immediately following the
+            // current byte and are handled below.)
             let mut flags = EmptyFlags::default();
             if b.is_eof() {
                 flags.end = true;
@@ -1048,7 +1048,7 @@
     ///
     /// If matching starts after the beginning of the input, then only start
     /// line should be set if the preceding byte is `\n`. End line should never
-    /// be set in this case. (Even if the proceding byte is a `\n`, it will
+    /// be set in this case. (Even if the following byte is a `\n`, it will
     /// be handled in a subsequent DFA state.)
     fn follow_epsilons(
         &mut self,

diff --git a/src/expand.rs b/src/expand.rs
index 528f55e..fd2ab03 100644
--- a/src/expand.rs
+++ b/src/expand.rs

@@ -24,7 +24,7 @@
             continue;
         }
         debug_assert!(!replacement.is_empty());
-        let cap_ref = match find_cap_ref(replacement) {
+        let cap_ref = match find_cap_ref(replacement.as_bytes()) {
             Some(cap_ref) => cap_ref,
             None => {
                 dst.push_str("$");
@@ -125,19 +125,15 @@
 /// starting at the beginning of `replacement`.
 ///
 /// If no such valid reference could be found, None is returned.
-fn find_cap_ref<T: ?Sized + AsRef<[u8]>>(
-    replacement: &T,
-) -> Option<CaptureRef> {
+fn find_cap_ref(replacement: &[u8]) -> Option<CaptureRef> {
     let mut i = 0;
     let rep: &[u8] = replacement.as_ref();
     if rep.len() <= 1 || rep[0] != b'$' {
         return None;
     }
-    let mut brace = false;
     i += 1;
     if rep[i] == b'{' {
-        brace = true;
-        i += 1;
+        return find_cap_ref_braced(rep, i + 1);
     }
     let mut cap_end = i;
     while rep.get(cap_end).map_or(false, is_valid_cap_letter) {
@@ -151,12 +147,6 @@
     // check with either unsafe or by parsing the number straight from &[u8].
     let cap =
         str::from_utf8(&rep[i..cap_end]).expect("valid UTF-8 capture name");
-    if brace {
-        if !rep.get(cap_end).map_or(false, |&b| b == b'}') {
-            return None;
-        }
-        cap_end += 1;
-    }
     Some(CaptureRef {
         cap: match cap.parse::<u32>() {
             Ok(i) => Ref::Number(i as usize),
@@ -166,6 +156,31 @@
     })
 }
 
+fn find_cap_ref_braced(rep: &[u8], mut i: usize) -> Option<CaptureRef> {
+    let start = i;
+    while rep.get(i).map_or(false, |&b| b != b'}') {
+        i += 1;
+    }
+    if !rep.get(i).map_or(false, |&b| b == b'}') {
+        return None;
+    }
+    // When looking at braced names, we don't put any restrictions on the name,
+    // so it's possible it could be invalid UTF-8. But a capture group name
+    // can never be invalid UTF-8, so if we have invalid UTF-8, then we can
+    // safely return None.
+    let cap = match str::from_utf8(&rep[start..i]) {
+        Err(_) => return None,
+        Ok(cap) => cap,
+    };
+    Some(CaptureRef {
+        cap: match cap.parse::<u32>() {
+            Ok(i) => Ref::Number(i as usize),
+            Err(_) => Ref::Named(cap),
+        },
+        end: i + 1,
+    })
+}
+
 /// Returns true if and only if the given byte is allowed in a capture name.
 fn is_valid_cap_letter(b: &u8) -> bool {
     match *b {
@@ -182,13 +197,13 @@
         ($name:ident, $text:expr) => {
             #[test]
             fn $name() {
-                assert_eq!(None, find_cap_ref($text));
+                assert_eq!(None, find_cap_ref($text.as_bytes()));
             }
         };
         ($name:ident, $text:expr, $capref:expr) => {
             #[test]
             fn $name() {
-                assert_eq!(Some($capref), find_cap_ref($text));
+                assert_eq!(Some($capref), find_cap_ref($text.as_bytes()));
             }
         };
     }
@@ -204,7 +219,8 @@
     find!(find_cap_ref3, "$0", c!(0, 2));
     find!(find_cap_ref4, "$5", c!(5, 2));
     find!(find_cap_ref5, "$10", c!(10, 3));
-    // see https://github.com/rust-lang/regex/pull/585 for more on characters following numbers
+    // See https://github.com/rust-lang/regex/pull/585
+    // for more on characters following numbers
     find!(find_cap_ref6, "$42a", c!("42a", 4));
     find!(find_cap_ref7, "${42}a", c!(42, 5));
     find!(find_cap_ref8, "${42");
@@ -217,4 +233,6 @@
     find!(find_cap_ref15, "$1_$2", c!("1_", 3));
     find!(find_cap_ref16, "$x-$y", c!("x", 2));
     find!(find_cap_ref17, "$x_$y", c!("x_", 3));
+    find!(find_cap_ref18, "${#}", c!("#", 4));
+    find!(find_cap_ref19, "${Z[}", c!("Z[", 5));
 }

diff --git a/src/lib.rs b/src/lib.rs
index e0a0975..bdcebd4 100644
--- a/src/lib.rs
+++ b/src/lib.rs

@@ -365,7 +365,7 @@
 
 <pre class="rust">
 (exp)          numbered capture group (indexed by opening parenthesis)
-(?P&lt;name&gt;exp)  named (also numbered) capture group (allowed chars: [_0-9a-zA-Z])
+(?P&lt;name&gt;exp)  named (also numbered) capture group (allowed chars: [_0-9a-zA-Z.\[\]])
 (?:exp)        non-capturing group
 (?flags)       set flags within current group
 (?flags:exp)   set flags for exp (non-capturing)
@@ -562,7 +562,7 @@
   [Unicode's "simple loose matches" specification](https://www.unicode.org/reports/tr18/#Simple_Loose_Matches).
 * **unicode-gencat** -
   Provide the data for
-  [Uncode general categories](https://www.unicode.org/reports/tr44/tr44-24.html#General_Category_Values).
+  [Unicode general categories](https://www.unicode.org/reports/tr44/tr44-24.html#General_Category_Values).
   This includes, but is not limited to, `Decimal_Number`, `Letter`,
   `Math_Symbol`, `Number` and `Punctuation`.
 * **unicode-perl** -
@@ -731,8 +731,8 @@
 literal byte `\xFF`, while in Unicode mode, `\xFF` is a Unicode codepoint that
 matches its UTF-8 encoding of `\xC3\xBF`. Similarly for octal notation when
 enabled.
-6. `.` matches any *byte* except for `\n` instead of any Unicode scalar value.
-When the `s` flag is enabled, `.` matches any byte.
+6. In ASCII compatible mode, `.` matches any *byte* except for `\n`. When the
+`s` flag is additionally enabled, `.` matches any byte.
 
 # Performance
 

diff --git a/src/pikevm.rs b/src/pikevm.rs
index c106c76..299087d 100644
--- a/src/pikevm.rs
+++ b/src/pikevm.rs

@@ -8,7 +8,7 @@
 //
 // It can do more than the DFA can (specifically, record capture locations
 // and execute Unicode word boundary assertions), but at a slower speed.
-// Specifically, the Pike VM exectues a DFA implicitly by repeatedly expanding
+// Specifically, the Pike VM executes a DFA implicitly by repeatedly expanding
 // epsilon transitions. That is, the Pike VM engine can be in multiple states
 // at once where as the DFA is only ever in one state at a time.
 //

diff --git a/src/re_bytes.rs b/src/re_bytes.rs
index 69f0b33..ca01e0e 100644
--- a/src/re_bytes.rs
+++ b/src/re_bytes.rs

@@ -119,7 +119,8 @@
         RegexBuilder::new(re).build()
     }
 
-    /// Returns true if and only if the regex matches the string given.
+    /// Returns true if and only if there is a match for the regex in the
+    /// string given.
     ///
     /// It is recommended to use this method if all you need to do is test
     /// a match, since the underlying matching engine may be able to do less
@@ -930,17 +931,22 @@
     /// Expands all instances of `$name` in `replacement` to the corresponding
     /// capture group `name`, and writes them to the `dst` buffer given.
     ///
-    /// `name` may be an integer corresponding to the index of the
-    /// capture group (counted by order of opening parenthesis where `0` is the
+    /// `name` may be an integer corresponding to the index of the capture
+    /// group (counted by order of opening parenthesis where `0` is the
     /// entire match) or it can be a name (consisting of letters, digits or
     /// underscores) corresponding to a named capture group.
     ///
     /// If `name` isn't a valid capture group (whether the name doesn't exist
     /// or isn't a valid index), then it is replaced with the empty string.
     ///
-    /// The longest possible name is used. e.g., `$1a` looks up the capture
-    /// group named `1a` and not the capture group at index `1`. To exert more
-    /// precise control over the name, use braces, e.g., `${1}a`.
+    /// The longest possible name consisting of the characters `[_0-9A-Za-z]`
+    /// is used. e.g., `$1a` looks up the capture group named `1a` and not the
+    /// capture group at index `1`. To exert more precise control over the
+    /// name, or to refer to a capture group name that uses characters outside
+    /// of `[_0-9A-Za-z]`, use braces, e.g., `${1}a` or `${foo[bar].baz}`. When
+    /// using braces, any sequence of valid UTF-8 bytes is permitted. If the
+    /// sequence does not refer to a capture group name in the corresponding
+    /// regex, then it is replaced with an empty string.
     ///
     /// To write a literal `$` use `$$`.
     pub fn expand(&self, replacement: &[u8], dst: &mut Vec<u8>) {
@@ -1051,6 +1057,7 @@
 ///
 /// The lifetime `'c` corresponds to the lifetime of the `Captures` value, and
 /// the lifetime `'t` corresponds to the originally matched text.
+#[derive(Clone)]
 pub struct SubCaptureMatches<'c, 't: 'c> {
     caps: &'c Captures<'t>,
     it: SubCapturesPosIter<'c>,

diff --git a/src/re_set.rs b/src/re_set.rs
index fc2b61a..b8954be 100644
--- a/src/re_set.rs
+++ b/src/re_set.rs

@@ -96,6 +96,19 @@
         RegexSetBuilder::new(exprs).build()
     }
 
+    /// Create a new empty regex set.
+    ///
+    /// # Example
+    ///
+    /// ```rust
+    /// # use regex::RegexSet;
+    /// let set = RegexSet::empty();
+    /// assert!(set.is_empty());
+    /// ```
+    pub fn empty() -> RegexSet {
+        RegexSetBuilder::new(&[""; 0]).build().unwrap()
+    }
+
     /// Returns true if and only if one of the regexes in this set matches
     /// the text given.
     ///
@@ -207,6 +220,11 @@
         self.0.regex_strings().len()
     }
 
+    /// Returns `true` if this set contains no regular expressions.
+    pub fn is_empty(&self) -> bool {
+        self.0.regex_strings().is_empty()
+    }
+
     /// Returns the patterns that this set will match on.
     ///
     /// This function can be used to determine the pattern for a match. The

diff --git a/src/re_trait.rs b/src/re_trait.rs
index b56804e..d14a9f7 100644
--- a/src/re_trait.rs
+++ b/src/re_trait.rs

@@ -51,6 +51,7 @@
 /// Positions are byte indices in terms of the original string matched.
 ///
 /// `'c` is the lifetime of the captures.
+#[derive(Clone)]
 pub struct SubCapturesPosIter<'c> {
     idx: usize,
     locs: &'c Locations,

diff --git a/src/re_unicode.rs b/src/re_unicode.rs
index b746599..ea95c1b 100644
--- a/src/re_unicode.rs
+++ b/src/re_unicode.rs

@@ -175,7 +175,8 @@
         RegexBuilder::new(re).build()
     }
 
-    /// Returns true if and only if the regex matches the string given.
+    /// Returns true if and only if there is a match for the regex in the
+    /// string given.
     ///
     /// It is recommended to use this method if all you need to do is test
     /// a match, since the underlying matching engine may be able to do less
@@ -947,17 +948,22 @@
     /// Expands all instances of `$name` in `replacement` to the corresponding
     /// capture group `name`, and writes them to the `dst` buffer given.
     ///
-    /// `name` may be an integer corresponding to the index of the
-    /// capture group (counted by order of opening parenthesis where `0` is the
+    /// `name` may be an integer corresponding to the index of the capture
+    /// group (counted by order of opening parenthesis where `0` is the
     /// entire match) or it can be a name (consisting of letters, digits or
     /// underscores) corresponding to a named capture group.
     ///
     /// If `name` isn't a valid capture group (whether the name doesn't exist
     /// or isn't a valid index), then it is replaced with the empty string.
     ///
-    /// The longest possible name is used. e.g., `$1a` looks up the capture
-    /// group named `1a` and not the capture group at index `1`. To exert more
-    /// precise control over the name, use braces, e.g., `${1}a`.
+    /// The longest possible name consisting of the characters `[_0-9A-Za-z]`
+    /// is used. e.g., `$1a` looks up the capture group named `1a` and not the
+    /// capture group at index `1`. To exert more precise control over the
+    /// name, or to refer to a capture group name that uses characters outside
+    /// of `[_0-9A-Za-z]`, use braces, e.g., `${1}a` or `${foo[bar].baz}`. When
+    /// using braces, any sequence of characters is permitted. If the sequence
+    /// does not refer to a capture group name in the corresponding regex, then
+    /// it is replaced with an empty string.
     ///
     /// To write a literal `$` use `$$`.
     pub fn expand(&self, replacement: &str, dst: &mut String) {
@@ -1053,6 +1059,7 @@
 ///
 /// The lifetime `'c` corresponds to the lifetime of the `Captures` value, and
 /// the lifetime `'t` corresponds to the originally matched text.
+#[derive(Clone)]
 pub struct SubCaptureMatches<'c, 't: 'c> {
     caps: &'c Captures<'t>,
     it: SubCapturesPosIter<'c>,
@@ -1122,7 +1129,7 @@
     /// have a match at capture group `0`.
     ///
     /// For example, a no-op replacement would be
-    /// `dst.extend(caps.get(0).unwrap().as_str())`.
+    /// `dst.push_str(caps.get(0).unwrap().as_str())`.
     fn replace_append(&mut self, caps: &Captures, dst: &mut String);
 
     /// Return a fixed unchanging replacement string.
commit	849e4457e7a77114ede925233df78a9dc6ad665b	[log] [tgz]
author	Chih-Hung Hsieh <chh@google.com>	Mon Oct 26 13:16:47 2020 -0700
committer	Chih-Hung Hsieh <chh@google.com>	Mon Oct 26 13:16:47 2020 -0700
tree	e873b7eb9bb377a84845126bfbbbc5301481d3bc
parent	01a9012e518e5502dc54ae36ed78610a492a4737 [diff]