Import 'regex' package version 1.3.6 * Add OWNERS * No Android.bp yet Bug: 152884384 Test: make Change-Id: I455caf7833b6c437c1c133bc7b2f47b83da9cbce

commit: e42c505f54ac2e7b2ca7d0197304cac1b4f605e9 [log] [tgz]
author: Chih-Hung Hsieh <chh@google.com> Thu Apr 16 10:44:21 2020 -0700
committer: Chih-Hung Hsieh <chh@google.com> Thu Apr 16 10:45:53 2020 -0700
tree: 7ca22ebdbdb7d0c9dc5c66d31f564aeaf51cece4
parent: 815d0a2344b5321fd47f0cb7dba96e4cb4e84615 [diff] [blame]
diff --git a/src/re_builder.rs b/src/re_builder.rs
new file mode 100644
index 0000000..3fef99d
--- /dev/null
+++ b/src/re_builder.rs

@@ -0,0 +1,419 @@
+/// The set of user configurable options for compiling zero or more regexes.
+///
+/// The builder types generated in this file each wrap one value of this
+/// struct, mutate it through their setter methods, and hand a clone of it to
+/// `ExecBuilder::new_options` when `build` is called.
+#[derive(Clone, Debug)]
+#[allow(missing_docs)]
+pub struct RegexOptions {
+    /// The pattern strings to compile, in the order they were supplied.
+    pub pats: Vec<String>,
+    /// Approximate size limit, in bytes, of a single compiled program.
+    pub size_limit: usize,
+    /// Approximate size, in bytes, of the cache used by the DFA.
+    pub dfa_size_limit: usize,
+    /// How deeply nested the parsed abstract syntax tree is allowed to be.
+    pub nest_limit: u32,
+    /// Value of the case insensitive (`i`) flag.
+    pub case_insensitive: bool,
+    /// Value of the multi-line matching (`m`) flag.
+    pub multi_line: bool,
+    /// Value of the any character (`s`) flag.
+    pub dot_matches_new_line: bool,
+    /// Value of the greedy swap (`U`) flag.
+    pub swap_greed: bool,
+    /// Value of the ignore whitespace (`x`) flag.
+    pub ignore_whitespace: bool,
+    /// Value of the Unicode (`u`) flag.
+    pub unicode: bool,
+    /// Whether octal syntax is supported.
+    pub octal: bool,
+}
+
+impl Default for RegexOptions {
+    /// Returns the baseline configuration: no patterns, a 10 MiB limit on the
+    /// compiled program, a 2 MiB limit on the DFA cache, a nest limit of 250,
+    /// the Unicode flag enabled, and every other flag disabled.
+    fn default() -> Self {
+        // One mebibyte; both size limits are expressed as multiples of it.
+        const MIB: usize = 1 << 20;
+
+        Self {
+            // No patterns until a builder pushes some.
+            pats: Vec::new(),
+            // Limits.
+            size_limit: 10 * MIB,
+            dfa_size_limit: 2 * MIB,
+            nest_limit: 250,
+            // Flags: only Unicode starts out enabled.
+            unicode: true,
+            case_insensitive: false,
+            multi_line: false,
+            dot_matches_new_line: false,
+            swap_greed: false,
+            ignore_whitespace: false,
+            octal: false,
+        }
+    }
+}
+
+// Generates a public module named `$name` that contains a `RegexBuilder`
+// type wrapping `RegexOptions`.
+//
+// Parameters:
+//   * `$name`: identifier of the generated module.
+//   * `$regex_mod`: module from which the `Regex` type returned by `build`
+//     is imported.
+//   * `$only_utf8`: expression forwarded to `ExecBuilder::only_utf8` when
+//     `build` is called.
+macro_rules! define_builder {
+    ($name:ident, $regex_mod:ident, $only_utf8:expr) => {
+        pub mod $name {
+            use super::RegexOptions;
+            use error::Error;
+            use exec::ExecBuilder;
+
+            use $regex_mod::Regex;
+
+            /// A configurable builder for a regular expression.
+            ///
+            /// A builder can be used to configure how the regex is built, for example, by
+            /// setting the default flags (which can be overridden in the expression
+            /// itself) or setting various limits.
+            pub struct RegexBuilder(RegexOptions);
+
+            impl RegexBuilder {
+                /// Create a new regular expression builder with the given pattern.
+                ///
+                /// All options start out at their `RegexOptions::default()` values.
+                ///
+                /// If the pattern is invalid, then an error will be returned when
+                /// `build` is called.
+                pub fn new(pattern: &str) -> RegexBuilder {
+                    let mut builder = RegexBuilder(RegexOptions::default());
+                    builder.0.pats.push(pattern.to_owned());
+                    builder
+                }
+
+                /// Compile the regular expression using the current configuration.
+                ///
+                /// The builder is only borrowed: its options are cloned for the
+                /// compilation, so the builder can be modified further and built
+                /// again afterwards.
+                ///
+                /// Note that calling `as_str` on the resulting `Regex` will produce the
+                /// pattern given to `new` verbatim. Notably, it will not incorporate any
+                /// of the flags set on this builder.
+                pub fn build(&self) -> Result<Regex, Error> {
+                    ExecBuilder::new_options(self.0.clone())
+                        .only_utf8($only_utf8)
+                        .build()
+                        .map(Regex::from)
+                }
+
+                /// Set the value for the case insensitive (`i`) flag.
+                ///
+                /// When enabled, letters in the pattern will match both upper case and
+                /// lower case variants.
+                pub fn case_insensitive(
+                    &mut self,
+                    yes: bool,
+                ) -> &mut RegexBuilder {
+                    self.0.case_insensitive = yes;
+                    self
+                }
+
+                /// Set the value for the multi-line matching (`m`) flag.
+                ///
+                /// When enabled, `^` matches the beginning of lines and `$` matches the
+                /// end of lines.
+                ///
+                /// By default, they match beginning/end of the input.
+                pub fn multi_line(&mut self, yes: bool) -> &mut RegexBuilder {
+                    self.0.multi_line = yes;
+                    self
+                }
+
+                /// Set the value for the any character (`s`) flag, wherein `.` matches
+                /// anything when `s` is set and matches anything except for new line when
+                /// it is not set (the default).
+                ///
+                /// N.B. "matches anything" means "any byte" when Unicode is disabled and
+                /// means "any valid UTF-8 encoding of any Unicode scalar value" when
+                /// Unicode is enabled.
+                pub fn dot_matches_new_line(
+                    &mut self,
+                    yes: bool,
+                ) -> &mut RegexBuilder {
+                    self.0.dot_matches_new_line = yes;
+                    self
+                }
+
+                /// Set the value for the greedy swap (`U`) flag.
+                ///
+                /// When enabled, a pattern like `a*` is lazy (tries to find shortest
+                /// match) and `a*?` is greedy (tries to find longest match).
+                ///
+                /// By default, `a*` is greedy and `a*?` is lazy.
+                pub fn swap_greed(&mut self, yes: bool) -> &mut RegexBuilder {
+                    self.0.swap_greed = yes;
+                    self
+                }
+
+                /// Set the value for the ignore whitespace (`x`) flag.
+                ///
+                /// When enabled, whitespace such as new lines and spaces will be ignored
+                /// between expressions of the pattern, and `#` can be used to start a
+                /// comment until the next new line.
+                pub fn ignore_whitespace(
+                    &mut self,
+                    yes: bool,
+                ) -> &mut RegexBuilder {
+                    self.0.ignore_whitespace = yes;
+                    self
+                }
+
+                /// Set the value for the Unicode (`u`) flag.
+                ///
+                /// Enabled by default. When disabled, character classes such as `\w` only
+                /// match ASCII word characters instead of all Unicode word characters.
+                pub fn unicode(&mut self, yes: bool) -> &mut RegexBuilder {
+                    self.0.unicode = yes;
+                    self
+                }
+
+                /// Whether to support octal syntax or not.
+                ///
+                /// Octal syntax is a little-known way of uttering Unicode codepoints in
+                /// a regular expression. For example, `a`, `\x61`, `\u0061` and
+                /// `\141` are all equivalent regular expressions, where the last example
+                /// shows octal syntax.
+                ///
+                /// While supporting octal syntax isn't in and of itself a problem, it does
+                /// make good error messages harder. That is, in PCRE based regex engines,
+                /// syntax like `\0` invokes a backreference, which is explicitly
+                /// unsupported in Rust's regex engine. However, many users expect it to
+                /// be supported. Therefore, when octal support is disabled, the error
+                /// message will explicitly mention that backreferences aren't supported.
+                ///
+                /// Octal syntax is disabled by default.
+                pub fn octal(&mut self, yes: bool) -> &mut RegexBuilder {
+                    self.0.octal = yes;
+                    self
+                }
+
+                /// Set the approximate size limit of the compiled regular expression.
+                ///
+                /// This roughly corresponds to the number of bytes occupied by a single
+                /// compiled program. If the program exceeds this number, then a
+                /// compilation error is returned.
+                pub fn size_limit(
+                    &mut self,
+                    limit: usize,
+                ) -> &mut RegexBuilder {
+                    self.0.size_limit = limit;
+                    self
+                }
+
+                /// Set the approximate size of the cache used by the DFA.
+                ///
+                /// This roughly corresponds to the number of bytes that the DFA will
+                /// use while searching.
+                ///
+                /// Note that this is a *per thread* limit. There is no way to set a global
+                /// limit. In particular, if a regex is used from multiple threads
+                /// simultaneously, then each thread may use up to the number of bytes
+                /// specified here.
+                pub fn dfa_size_limit(
+                    &mut self,
+                    limit: usize,
+                ) -> &mut RegexBuilder {
+                    self.0.dfa_size_limit = limit;
+                    self
+                }
+
+                /// Set the nesting limit for this parser.
+                ///
+                /// The nesting limit controls how deep the abstract syntax tree is allowed
+                /// to be. If the AST exceeds the given limit (e.g., with too many nested
+                /// groups), then an error is returned by the parser.
+                ///
+                /// The purpose of this limit is to act as a heuristic to prevent stack
+                /// overflow for consumers that do structural induction on an `Ast` using
+                /// explicit recursion. While this crate never does this (instead using
+                /// constant stack space and moving the call stack to the heap), other
+                /// crates may.
+                ///
+                /// This limit is not checked until the entire Ast is parsed. Therefore,
+                /// if callers want to put a limit on the amount of heap space used, then
+                /// they should impose a limit on the length, in bytes, of the concrete
+                /// pattern string. In particular, this is viable since this parser
+                /// implementation will limit itself to heap space proportional to the
+                /// length of the pattern string.
+                ///
+                /// Note that a nest limit of `0` will return a nest limit error for most
+                /// patterns but not all. For example, a nest limit of `0` permits `a` but
+                /// not `ab`, since `ab` requires a concatenation, which results in a nest
+                /// depth of `1`. In general, a nest limit is not something that manifests
+                /// in an obvious way in the concrete syntax, therefore, it should not be
+                /// used in a granular way.
+                pub fn nest_limit(&mut self, limit: u32) -> &mut RegexBuilder {
+                    self.0.nest_limit = limit;
+                    self
+                }
+            }
+        }
+    };
+}
+
+// `bytes::RegexBuilder`: builds `re_bytes::Regex` with `only_utf8(false)`.
+define_builder!(bytes, re_bytes, false);
+// `unicode::RegexBuilder`: builds `re_unicode::Regex` with `only_utf8(true)`.
+define_builder!(unicode, re_unicode, true);
+
+// Generates a public module named `$name` that contains a `RegexSetBuilder`
+// type wrapping `RegexOptions`.
+//
+// Parameters:
+//   * `$name`: identifier of the generated module.
+//   * `$regex_mod`: submodule of `re_set` from which the `RegexSet` type
+//     returned by `build` is imported.
+//   * `$only_utf8`: expression forwarded to `ExecBuilder::only_utf8` when
+//     `build` is called.
+macro_rules! define_set_builder {
+    ($name:ident, $regex_mod:ident, $only_utf8:expr) => {
+        pub mod $name {
+            use super::RegexOptions;
+            use error::Error;
+            use exec::ExecBuilder;
+
+            use re_set::$regex_mod::RegexSet;
+
+            /// A configurable builder for a set of regular expressions.
+            ///
+            /// A builder can be used to configure how the regexes are built, for example,
+            /// by setting the default flags (which can be overridden in the expression
+            /// itself) or setting various limits.
+            pub struct RegexSetBuilder(RegexOptions);
+
+            impl RegexSetBuilder {
+                /// Create a new regular expression set builder with the given patterns.
+                ///
+                /// Every item yielded by `patterns` is copied into the builder, in
+                /// iteration order. All options start out at their
+                /// `RegexOptions::default()` values.
+                ///
+                /// If a pattern is invalid, then an error will be returned when
+                /// `build` is called.
+                pub fn new<I, S>(patterns: I) -> RegexSetBuilder
+                where
+                    S: AsRef<str>,
+                    I: IntoIterator<Item = S>,
+                {
+                    let mut builder = RegexSetBuilder(RegexOptions::default());
+                    for pat in patterns {
+                        builder.0.pats.push(pat.as_ref().to_owned());
+                    }
+                    builder
+                }
+
+                /// Compile the regular expressions into a set using the current
+                /// configuration.
+                ///
+                /// The builder is only borrowed: its options are cloned for the
+                /// compilation, so the builder can be modified further and built
+                /// again afterwards.
+                pub fn build(&self) -> Result<RegexSet, Error> {
+                    ExecBuilder::new_options(self.0.clone())
+                        .only_utf8($only_utf8)
+                        .build()
+                        .map(RegexSet::from)
+                }
+
+                /// Set the value for the case insensitive (`i`) flag.
+                ///
+                /// When enabled, letters in the patterns will match both upper case
+                /// and lower case variants.
+                pub fn case_insensitive(
+                    &mut self,
+                    yes: bool,
+                ) -> &mut RegexSetBuilder {
+                    self.0.case_insensitive = yes;
+                    self
+                }
+
+                /// Set the value for the multi-line matching (`m`) flag.
+                ///
+                /// When enabled, `^` matches the beginning of lines and `$` matches the
+                /// end of lines.
+                ///
+                /// By default, they match beginning/end of the input.
+                pub fn multi_line(
+                    &mut self,
+                    yes: bool,
+                ) -> &mut RegexSetBuilder {
+                    self.0.multi_line = yes;
+                    self
+                }
+
+                /// Set the value for the any character (`s`) flag, wherein `.` matches
+                /// anything when `s` is set and matches anything except for new line when
+                /// it is not set (the default).
+                ///
+                /// N.B. "matches anything" means "any byte" for `regex::bytes::RegexSet`
+                /// expressions and means "any Unicode scalar value" for `regex::RegexSet`
+                /// expressions.
+                pub fn dot_matches_new_line(
+                    &mut self,
+                    yes: bool,
+                ) -> &mut RegexSetBuilder {
+                    self.0.dot_matches_new_line = yes;
+                    self
+                }
+
+                /// Set the value for the greedy swap (`U`) flag.
+                ///
+                /// When enabled, a pattern like `a*` is lazy (tries to find shortest
+                /// match) and `a*?` is greedy (tries to find longest match).
+                ///
+                /// By default, `a*` is greedy and `a*?` is lazy.
+                pub fn swap_greed(
+                    &mut self,
+                    yes: bool,
+                ) -> &mut RegexSetBuilder {
+                    self.0.swap_greed = yes;
+                    self
+                }
+
+                /// Set the value for the ignore whitespace (`x`) flag.
+                ///
+                /// When enabled, whitespace such as new lines and spaces will be ignored
+                /// between expressions of the pattern, and `#` can be used to start a
+                /// comment until the next new line.
+                pub fn ignore_whitespace(
+                    &mut self,
+                    yes: bool,
+                ) -> &mut RegexSetBuilder {
+                    self.0.ignore_whitespace = yes;
+                    self
+                }
+
+                /// Set the value for the Unicode (`u`) flag.
+                ///
+                /// Enabled by default. When disabled, character classes such as `\w` only
+                /// match ASCII word characters instead of all Unicode word characters.
+                pub fn unicode(&mut self, yes: bool) -> &mut RegexSetBuilder {
+                    self.0.unicode = yes;
+                    self
+                }
+
+                /// Whether to support octal syntax or not.
+                ///
+                /// Octal syntax is a little-known way of uttering Unicode codepoints in
+                /// a regular expression. For example, `a`, `\x61`, `\u0061` and
+                /// `\141` are all equivalent regular expressions, where the last example
+                /// shows octal syntax.
+                ///
+                /// While supporting octal syntax isn't in and of itself a problem, it does
+                /// make good error messages harder. That is, in PCRE based regex engines,
+                /// syntax like `\0` invokes a backreference, which is explicitly
+                /// unsupported in Rust's regex engine. However, many users expect it to
+                /// be supported. Therefore, when octal support is disabled, the error
+                /// message will explicitly mention that backreferences aren't supported.
+                ///
+                /// Octal syntax is disabled by default.
+                pub fn octal(&mut self, yes: bool) -> &mut RegexSetBuilder {
+                    self.0.octal = yes;
+                    self
+                }
+
+                /// Set the approximate size limit of the compiled regular expression.
+                ///
+                /// This roughly corresponds to the number of bytes occupied by a single
+                /// compiled program. If the program exceeds this number, then a
+                /// compilation error is returned.
+                pub fn size_limit(
+                    &mut self,
+                    limit: usize,
+                ) -> &mut RegexSetBuilder {
+                    self.0.size_limit = limit;
+                    self
+                }
+
+                /// Set the approximate size of the cache used by the DFA.
+                ///
+                /// This roughly corresponds to the number of bytes that the DFA will
+                /// use while searching.
+                ///
+                /// Note that this is a *per thread* limit. There is no way to set a global
+                /// limit. In particular, if a regex is used from multiple threads
+                /// simultaneously, then each thread may use up to the number of bytes
+                /// specified here.
+                pub fn dfa_size_limit(
+                    &mut self,
+                    limit: usize,
+                ) -> &mut RegexSetBuilder {
+                    self.0.dfa_size_limit = limit;
+                    self
+                }
+
+                /// Set the nesting limit for this parser.
+                ///
+                /// The nesting limit controls how deep the abstract syntax tree is allowed
+                /// to be. If the AST exceeds the given limit (e.g., with too many nested
+                /// groups), then an error is returned by the parser.
+                ///
+                /// The purpose of this limit is to act as a heuristic to prevent stack
+                /// overflow for consumers that do structural induction on an `Ast` using
+                /// explicit recursion. While this crate never does this (instead using
+                /// constant stack space and moving the call stack to the heap), other
+                /// crates may.
+                ///
+                /// This limit is not checked until the entire Ast is parsed. Therefore,
+                /// if callers want to put a limit on the amount of heap space used, then
+                /// they should impose a limit on the length, in bytes, of the concrete
+                /// pattern string. In particular, this is viable since this parser
+                /// implementation will limit itself to heap space proportional to the
+                /// length of the pattern string.
+                ///
+                /// Note that a nest limit of `0` will return a nest limit error for most
+                /// patterns but not all. For example, a nest limit of `0` permits `a` but
+                /// not `ab`, since `ab` requires a concatenation, which results in a nest
+                /// depth of `1`. In general, a nest limit is not something that manifests
+                /// in an obvious way in the concrete syntax, therefore, it should not be
+                /// used in a granular way.
+                pub fn nest_limit(
+                    &mut self,
+                    limit: u32,
+                ) -> &mut RegexSetBuilder {
+                    self.0.nest_limit = limit;
+                    self
+                }
+            }
+        }
+    };
+}
+
+// `set_bytes::RegexSetBuilder`: builds `re_set::bytes::RegexSet` with
+// `only_utf8(false)`.
+define_set_builder!(set_bytes, bytes, false);
+// `set_unicode::RegexSetBuilder`: builds `re_set::unicode::RegexSet` with
+// `only_utf8(true)`.
+define_set_builder!(set_unicode, unicode, true);
commit	e42c505f54ac2e7b2ca7d0197304cac1b4f605e9	[log] [tgz]
author	Chih-Hung Hsieh <chh@google.com>	Thu Apr 16 10:44:21 2020 -0700
committer	Chih-Hung Hsieh <chh@google.com>	Thu Apr 16 10:45:53 2020 -0700
tree	7ca22ebdbdb7d0c9dc5c66d31f564aeaf51cece4
parent	815d0a2344b5321fd47f0cb7dba96e4cb4e84615 [diff] [blame]