Import 'regex-syntax' package vesion 0.6.17 * Add OWNERS Bug: 152884384 Test: make Change-Id: I760a69ffc851039ceaff58980deece308a426aed

commit: 048fc04e7fd0dd2014a94e73fb34f6e59021c170 [log] [tgz]
author: Chih-Hung Hsieh <chh@google.com> Thu Apr 16 10:44:22 2020 -0700
committer: Chih-Hung Hsieh <chh@google.com> Thu Apr 16 10:46:29 2020 -0700
tree: f9b501b392f13f2ca9d86d0a9d377b181f624dfb
parent: 2dd07da5420c40e365646bf756938d8d1abf6d62 [diff] [blame]
diff --git a/src/ast/mod.rs b/src/ast/mod.rs
new file mode 100644
index 0000000..7179f2d
--- /dev/null
+++ b/src/ast/mod.rs

@@ -0,0 +1,1502 @@
+/*!
+Defines an abstract syntax for regular expressions.
+*/
+
+use std::cmp::Ordering;
+use std::error;
+use std::fmt;
+
+pub use ast::visitor::{visit, Visitor};
+
+pub mod parse;
+pub mod print;
+mod visitor;
+
+/// An error that occurred while parsing a regular expression into an abstract
+/// syntax tree.
+///
+/// Note that note all ASTs represents a valid regular expression. For example,
+/// an AST is constructed without error for `\p{Quux}`, but `Quux` is not a
+/// valid Unicode property name. That particular error is reported when
+/// translating an AST to the high-level intermediate representation (`HIR`).
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct Error {
+    /// The kind of error.
+    kind: ErrorKind,
+    /// The original pattern that the parser generated the error from. Every
+    /// span in an error is a valid range into this string.
+    pattern: String,
+    /// The span of this error.
+    span: Span,
+}
+
+impl Error {
+    /// Return the type of this error.
+    pub fn kind(&self) -> &ErrorKind {
+        &self.kind
+    }
+
+    /// The original pattern string in which this error occurred.
+    ///
+    /// Every span reported by this error is reported in terms of this string.
+    pub fn pattern(&self) -> &str {
+        &self.pattern
+    }
+
+    /// Return the span at which this error occurred.
+    pub fn span(&self) -> &Span {
+        &self.span
+    }
+
+    /// Return an auxiliary span. This span exists only for some errors that
+    /// benefit from being able to point to two locations in the original
+    /// regular expression. For example, "duplicate" errors will have the
+    /// main error position set to the duplicate occurrence while its
+    /// auxiliary span will be set to the initial occurrence.
+    pub fn auxiliary_span(&self) -> Option<&Span> {
+        use self::ErrorKind::*;
+        match self.kind {
+            FlagDuplicate { ref original } => Some(original),
+            FlagRepeatedNegation { ref original, .. } => Some(original),
+            GroupNameDuplicate { ref original, .. } => Some(original),
+            _ => None,
+        }
+    }
+}
+
+/// The type of an error that occurred while building an AST.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub enum ErrorKind {
+    /// The capturing group limit was exceeded.
+    ///
+    /// Note that this represents a limit on the total number of capturing
+    /// groups in a regex and not necessarily the number of nested capturing
+    /// groups. That is, the nest limit can be low and it is still possible for
+    /// this error to occur.
+    CaptureLimitExceeded,
+    /// An invalid escape sequence was found in a character class set.
+    ClassEscapeInvalid,
+    /// An invalid character class range was found. An invalid range is any
+    /// range where the start is greater than the end.
+    ClassRangeInvalid,
+    /// An invalid range boundary was found in a character class. Range
+    /// boundaries must be a single literal codepoint, but this error indicates
+    /// that something else was found, such as a nested class.
+    ClassRangeLiteral,
+    /// An opening `[` was found with no corresponding closing `]`.
+    ClassUnclosed,
+    /// Note that this error variant is no longer used. Namely, a decimal
+    /// number can only appear as a repetition quantifier. When the number
+    /// in a repetition quantifier is empty, then it gets its own specialized
+    /// error, `RepetitionCountDecimalEmpty`.
+    DecimalEmpty,
+    /// An invalid decimal number was given where one was expected.
+    DecimalInvalid,
+    /// A bracketed hex literal was empty.
+    EscapeHexEmpty,
+    /// A bracketed hex literal did not correspond to a Unicode scalar value.
+    EscapeHexInvalid,
+    /// An invalid hexadecimal digit was found.
+    EscapeHexInvalidDigit,
+    /// EOF was found before an escape sequence was completed.
+    EscapeUnexpectedEof,
+    /// An unrecognized escape sequence.
+    EscapeUnrecognized,
+    /// A dangling negation was used when setting flags, e.g., `i-`.
+    FlagDanglingNegation,
+    /// A flag was used twice, e.g., `i-i`.
+    FlagDuplicate {
+        /// The position of the original flag. The error position
+        /// points to the duplicate flag.
+        original: Span,
+    },
+    /// The negation operator was used twice, e.g., `-i-s`.
+    FlagRepeatedNegation {
+        /// The position of the original negation operator. The error position
+        /// points to the duplicate negation operator.
+        original: Span,
+    },
+    /// Expected a flag but got EOF, e.g., `(?`.
+    FlagUnexpectedEof,
+    /// Unrecognized flag, e.g., `a`.
+    FlagUnrecognized,
+    /// A duplicate capture name was found.
+    GroupNameDuplicate {
+        /// The position of the initial occurrence of the capture name. The
+        /// error position itself points to the duplicate occurrence.
+        original: Span,
+    },
+    /// A capture group name is empty, e.g., `(?P<>abc)`.
+    GroupNameEmpty,
+    /// An invalid character was seen for a capture group name. This includes
+    /// errors where the first character is a digit (even though subsequent
+    /// characters are allowed to be digits).
+    GroupNameInvalid,
+    /// A closing `>` could not be found for a capture group name.
+    GroupNameUnexpectedEof,
+    /// An unclosed group, e.g., `(ab`.
+    ///
+    /// The span of this error corresponds to the unclosed parenthesis.
+    GroupUnclosed,
+    /// An unopened group, e.g., `ab)`.
+    GroupUnopened,
+    /// The nest limit was exceeded. The limit stored here is the limit
+    /// configured in the parser.
+    NestLimitExceeded(u32),
+    /// The range provided in a counted repetition operator is invalid. The
+    /// range is invalid if the start is greater than the end.
+    RepetitionCountInvalid,
+    /// An opening `{` was not followed by a valid decimal value.
+    /// For example, `x{}` or `x{]}` would fail.
+    RepetitionCountDecimalEmpty,
+    /// An opening `{` was found with no corresponding closing `}`.
+    RepetitionCountUnclosed,
+    /// A repetition operator was applied to a missing sub-expression. This
+    /// occurs, for example, in the regex consisting of just a `*` or even
+    /// `(?i)*`. It is, however, possible to create a repetition operating on
+    /// an empty sub-expression. For example, `()*` is still considered valid.
+    RepetitionMissing,
+    /// The Unicode class is not valid. This typically occurs when a `\p` is
+    /// followed by something other than a `{`.
+    UnicodeClassInvalid,
+    /// When octal support is disabled, this error is produced when an octal
+    /// escape is used. The octal escape is assumed to be an invocation of
+    /// a backreference, which is the common case.
+    UnsupportedBackreference,
+    /// When syntax similar to PCRE's look-around is used, this error is
+    /// returned. Some example syntaxes that are rejected include, but are
+    /// not necessarily limited to, `(?=re)`, `(?!re)`, `(?<=re)` and
+    /// `(?<!re)`. Note that all of these syntaxes are otherwise invalid; this
+    /// error is used to improve the user experience.
+    UnsupportedLookAround,
+    /// Hints that destructuring should not be exhaustive.
+    ///
+    /// This enum may grow additional variants, so this makes sure clients
+    /// don't count on exhaustive matching. (Otherwise, adding a new variant
+    /// could break existing code.)
+    #[doc(hidden)]
+    __Nonexhaustive,
+}
+
+impl error::Error for Error {
+    // TODO: Remove this method entirely on the next breaking semver release.
+    #[allow(deprecated)]
+    fn description(&self) -> &str {
+        use self::ErrorKind::*;
+        match self.kind {
+            CaptureLimitExceeded => "capture group limit exceeded",
+            ClassEscapeInvalid => "invalid escape sequence in character class",
+            ClassRangeInvalid => "invalid character class range",
+            ClassRangeLiteral => "invalid range boundary, must be a literal",
+            ClassUnclosed => "unclosed character class",
+            DecimalEmpty => "empty decimal literal",
+            DecimalInvalid => "invalid decimal literal",
+            EscapeHexEmpty => "empty hexadecimal literal",
+            EscapeHexInvalid => "invalid hexadecimal literal",
+            EscapeHexInvalidDigit => "invalid hexadecimal digit",
+            EscapeUnexpectedEof => "unexpected eof (escape sequence)",
+            EscapeUnrecognized => "unrecognized escape sequence",
+            FlagDanglingNegation => "dangling flag negation operator",
+            FlagDuplicate { .. } => "duplicate flag",
+            FlagRepeatedNegation { .. } => "repeated negation",
+            FlagUnexpectedEof => "unexpected eof (flag)",
+            FlagUnrecognized => "unrecognized flag",
+            GroupNameDuplicate { .. } => "duplicate capture group name",
+            GroupNameEmpty => "empty capture group name",
+            GroupNameInvalid => "invalid capture group name",
+            GroupNameUnexpectedEof => "unclosed capture group name",
+            GroupUnclosed => "unclosed group",
+            GroupUnopened => "unopened group",
+            NestLimitExceeded(_) => "nest limit exceeded",
+            RepetitionCountInvalid => "invalid repetition count range",
+            RepetitionCountUnclosed => "unclosed counted repetition",
+            RepetitionMissing => "repetition operator missing expression",
+            UnicodeClassInvalid => "invalid Unicode character class",
+            UnsupportedBackreference => "backreferences are not supported",
+            UnsupportedLookAround => "look-around is not supported",
+            _ => unreachable!(),
+        }
+    }
+}
+
+impl fmt::Display for Error {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        ::error::Formatter::from(self).fmt(f)
+    }
+}
+
+impl fmt::Display for ErrorKind {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        use self::ErrorKind::*;
+        match *self {
+            CaptureLimitExceeded => write!(
+                f,
+                "exceeded the maximum number of \
+                 capturing groups ({})",
+                ::std::u32::MAX
+            ),
+            ClassEscapeInvalid => {
+                write!(f, "invalid escape sequence found in character class")
+            }
+            ClassRangeInvalid => write!(
+                f,
+                "invalid character class range, \
+                 the start must be <= the end"
+            ),
+            ClassRangeLiteral => {
+                write!(f, "invalid range boundary, must be a literal")
+            }
+            ClassUnclosed => write!(f, "unclosed character class"),
+            DecimalEmpty => write!(f, "decimal literal empty"),
+            DecimalInvalid => write!(f, "decimal literal invalid"),
+            EscapeHexEmpty => write!(f, "hexadecimal literal empty"),
+            EscapeHexInvalid => {
+                write!(f, "hexadecimal literal is not a Unicode scalar value")
+            }
+            EscapeHexInvalidDigit => write!(f, "invalid hexadecimal digit"),
+            EscapeUnexpectedEof => write!(
+                f,
+                "incomplete escape sequence, \
+                 reached end of pattern prematurely"
+            ),
+            EscapeUnrecognized => write!(f, "unrecognized escape sequence"),
+            FlagDanglingNegation => {
+                write!(f, "dangling flag negation operator")
+            }
+            FlagDuplicate { .. } => write!(f, "duplicate flag"),
+            FlagRepeatedNegation { .. } => {
+                write!(f, "flag negation operator repeated")
+            }
+            FlagUnexpectedEof => {
+                write!(f, "expected flag but got end of regex")
+            }
+            FlagUnrecognized => write!(f, "unrecognized flag"),
+            GroupNameDuplicate { .. } => {
+                write!(f, "duplicate capture group name")
+            }
+            GroupNameEmpty => write!(f, "empty capture group name"),
+            GroupNameInvalid => write!(f, "invalid capture group character"),
+            GroupNameUnexpectedEof => write!(f, "unclosed capture group name"),
+            GroupUnclosed => write!(f, "unclosed group"),
+            GroupUnopened => write!(f, "unopened group"),
+            NestLimitExceeded(limit) => write!(
+                f,
+                "exceed the maximum number of \
+                 nested parentheses/brackets ({})",
+                limit
+            ),
+            RepetitionCountInvalid => write!(
+                f,
+                "invalid repetition count range, \
+                 the start must be <= the end"
+            ),
+            RepetitionCountDecimalEmpty => {
+                write!(f, "repetition quantifier expects a valid decimal")
+            }
+            RepetitionCountUnclosed => {
+                write!(f, "unclosed counted repetition")
+            }
+            RepetitionMissing => {
+                write!(f, "repetition operator missing expression")
+            }
+            UnicodeClassInvalid => {
+                write!(f, "invalid Unicode character class")
+            }
+            UnsupportedBackreference => {
+                write!(f, "backreferences are not supported")
+            }
+            UnsupportedLookAround => write!(
+                f,
+                "look-around, including look-ahead and look-behind, \
+                 is not supported"
+            ),
+            _ => unreachable!(),
+        }
+    }
+}
+
+/// Span represents the position information of a single AST item.
+///
+/// All span positions are absolute byte offsets that can be used on the
+/// original regular expression that was parsed.
+#[derive(Clone, Copy, Eq, PartialEq)]
+pub struct Span {
+    /// The start byte offset.
+    pub start: Position,
+    /// The end byte offset.
+    pub end: Position,
+}
+
+impl fmt::Debug for Span {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        write!(f, "Span({:?}, {:?})", self.start, self.end)
+    }
+}
+
+impl Ord for Span {
+    fn cmp(&self, other: &Span) -> Ordering {
+        (&self.start, &self.end).cmp(&(&other.start, &other.end))
+    }
+}
+
+impl PartialOrd for Span {
+    fn partial_cmp(&self, other: &Span) -> Option<Ordering> {
+        Some(self.cmp(other))
+    }
+}
+
+/// A single position in a regular expression.
+///
+/// A position encodes one half of a span, and include the byte offset, line
+/// number and column number.
+#[derive(Clone, Copy, Eq, PartialEq)]
+pub struct Position {
+    /// The absolute offset of this position, starting at `0` from the
+    /// beginning of the regular expression pattern string.
+    pub offset: usize,
+    /// The line number, starting at `1`.
+    pub line: usize,
+    /// The approximate column number, starting at `1`.
+    pub column: usize,
+}
+
+impl fmt::Debug for Position {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        write!(
+            f,
+            "Position(o: {:?}, l: {:?}, c: {:?})",
+            self.offset, self.line, self.column
+        )
+    }
+}
+
+impl Ord for Position {
+    fn cmp(&self, other: &Position) -> Ordering {
+        self.offset.cmp(&other.offset)
+    }
+}
+
+impl PartialOrd for Position {
+    fn partial_cmp(&self, other: &Position) -> Option<Ordering> {
+        Some(self.cmp(other))
+    }
+}
+
+impl Span {
+    /// Create a new span with the given positions.
+    pub fn new(start: Position, end: Position) -> Span {
+        Span { start: start, end: end }
+    }
+
+    /// Create a new span using the given position as the start and end.
+    pub fn splat(pos: Position) -> Span {
+        Span::new(pos, pos)
+    }
+
+    /// Create a new span by replacing the starting the position with the one
+    /// given.
+    pub fn with_start(self, pos: Position) -> Span {
+        Span { start: pos, ..self }
+    }
+
+    /// Create a new span by replacing the ending the position with the one
+    /// given.
+    pub fn with_end(self, pos: Position) -> Span {
+        Span { end: pos, ..self }
+    }
+
+    /// Returns true if and only if this span occurs on a single line.
+    pub fn is_one_line(&self) -> bool {
+        self.start.line == self.end.line
+    }
+
+    /// Returns true if and only if this span is empty. That is, it points to
+    /// a single position in the concrete syntax of a regular expression.
+    pub fn is_empty(&self) -> bool {
+        self.start.offset == self.end.offset
+    }
+}
+
+impl Position {
+    /// Create a new position with the given information.
+    ///
+    /// `offset` is the absolute offset of the position, starting at `0` from
+    /// the beginning of the regular expression pattern string.
+    ///
+    /// `line` is the line number, starting at `1`.
+    ///
+    /// `column` is the approximate column number, starting at `1`.
+    pub fn new(offset: usize, line: usize, column: usize) -> Position {
+        Position { offset: offset, line: line, column: column }
+    }
+}
+
+/// An abstract syntax tree for a singular expression along with comments
+/// found.
+///
+/// Comments are not stored in the tree itself to avoid complexity. Each
+/// comment contains a span of precisely where it occurred in the original
+/// regular expression.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct WithComments {
+    /// The actual ast.
+    pub ast: Ast,
+    /// All comments found in the original regular expression.
+    pub comments: Vec<Comment>,
+}
+
+/// A comment from a regular expression with an associated span.
+///
+/// A regular expression can only contain comments when the `x` flag is
+/// enabled.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct Comment {
+    /// The span of this comment, including the beginning `#` and ending `\n`.
+    pub span: Span,
+    /// The comment text, starting with the first character following the `#`
+    /// and ending with the last character preceding the `\n`.
+    pub comment: String,
+}
+
+/// An abstract syntax tree for a single regular expression.
+///
+/// An `Ast`'s `fmt::Display` implementation uses constant stack space and heap
+/// space proportional to the size of the `Ast`.
+///
+/// This type defines its own destructor that uses constant stack space and
+/// heap space proportional to the size of the `Ast`.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub enum Ast {
+    /// An empty regex that matches everything.
+    Empty(Span),
+    /// A set of flags, e.g., `(?is)`.
+    Flags(SetFlags),
+    /// A single character literal, which includes escape sequences.
+    Literal(Literal),
+    /// The "any character" class.
+    Dot(Span),
+    /// A single zero-width assertion.
+    Assertion(Assertion),
+    /// A single character class. This includes all forms of character classes
+    /// except for `.`. e.g., `\d`, `\pN`, `[a-z]` and `[[:alpha:]]`.
+    Class(Class),
+    /// A repetition operator applied to an arbitrary regular expression.
+    Repetition(Repetition),
+    /// A grouped regular expression.
+    Group(Group),
+    /// An alternation of regular expressions.
+    Alternation(Alternation),
+    /// A concatenation of regular expressions.
+    Concat(Concat),
+}
+
+impl Ast {
+    /// Return the span of this abstract syntax tree.
+    pub fn span(&self) -> &Span {
+        match *self {
+            Ast::Empty(ref span) => span,
+            Ast::Flags(ref x) => &x.span,
+            Ast::Literal(ref x) => &x.span,
+            Ast::Dot(ref span) => span,
+            Ast::Assertion(ref x) => &x.span,
+            Ast::Class(ref x) => x.span(),
+            Ast::Repetition(ref x) => &x.span,
+            Ast::Group(ref x) => &x.span,
+            Ast::Alternation(ref x) => &x.span,
+            Ast::Concat(ref x) => &x.span,
+        }
+    }
+
+    /// Return true if and only if this Ast is empty.
+    pub fn is_empty(&self) -> bool {
+        match *self {
+            Ast::Empty(_) => true,
+            _ => false,
+        }
+    }
+
+    /// Returns true if and only if this AST has any (including possibly empty)
+    /// subexpressions.
+    fn has_subexprs(&self) -> bool {
+        match *self {
+            Ast::Empty(_)
+            | Ast::Flags(_)
+            | Ast::Literal(_)
+            | Ast::Dot(_)
+            | Ast::Assertion(_) => false,
+            Ast::Class(_)
+            | Ast::Repetition(_)
+            | Ast::Group(_)
+            | Ast::Alternation(_)
+            | Ast::Concat(_) => true,
+        }
+    }
+}
+
+/// Print a display representation of this Ast.
+///
+/// This does not preserve any of the original whitespace formatting that may
+/// have originally been present in the concrete syntax from which this Ast
+/// was generated.
+///
+/// This implementation uses constant stack space and heap space proportional
+/// to the size of the `Ast`.
+impl fmt::Display for Ast {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        use ast::print::Printer;
+        Printer::new().print(self, f)
+    }
+}
+
+/// An alternation of regular expressions.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct Alternation {
+    /// The span of this alternation.
+    pub span: Span,
+    /// The alternate regular expressions.
+    pub asts: Vec<Ast>,
+}
+
+impl Alternation {
+    /// Return this alternation as an AST.
+    ///
+    /// If this alternation contains zero ASTs, then Ast::Empty is
+    /// returned. If this alternation contains exactly 1 AST, then the
+    /// corresponding AST is returned. Otherwise, Ast::Alternation is returned.
+    pub fn into_ast(mut self) -> Ast {
+        match self.asts.len() {
+            0 => Ast::Empty(self.span),
+            1 => self.asts.pop().unwrap(),
+            _ => Ast::Alternation(self),
+        }
+    }
+}
+
+/// A concatenation of regular expressions.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct Concat {
+    /// The span of this concatenation.
+    pub span: Span,
+    /// The concatenation regular expressions.
+    pub asts: Vec<Ast>,
+}
+
+impl Concat {
+    /// Return this concatenation as an AST.
+    ///
+    /// If this concatenation contains zero ASTs, then Ast::Empty is
+    /// returned. If this concatenation contains exactly 1 AST, then the
+    /// corresponding AST is returned. Otherwise, Ast::Concat is returned.
+    pub fn into_ast(mut self) -> Ast {
+        match self.asts.len() {
+            0 => Ast::Empty(self.span),
+            1 => self.asts.pop().unwrap(),
+            _ => Ast::Concat(self),
+        }
+    }
+}
+
+/// A single literal expression.
+///
+/// A literal corresponds to a single Unicode scalar value. Literals may be
+/// represented in their literal form, e.g., `a` or in their escaped form,
+/// e.g., `\x61`.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct Literal {
+    /// The span of this literal.
+    pub span: Span,
+    /// The kind of this literal.
+    pub kind: LiteralKind,
+    /// The Unicode scalar value corresponding to this literal.
+    pub c: char,
+}
+
+impl Literal {
+    /// If this literal was written as a `\x` hex escape, then this returns
+    /// the corresponding byte value. Otherwise, this returns `None`.
+    pub fn byte(&self) -> Option<u8> {
+        let short_hex = LiteralKind::HexFixed(HexLiteralKind::X);
+        if self.c as u32 <= 255 && self.kind == short_hex {
+            Some(self.c as u8)
+        } else {
+            None
+        }
+    }
+}
+
+/// The kind of a single literal expression.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub enum LiteralKind {
+    /// The literal is written verbatim, e.g., `a` or `☃`.
+    Verbatim,
+    /// The literal is written as an escape because it is punctuation, e.g.,
+    /// `\*` or `\[`.
+    Punctuation,
+    /// The literal is written as an octal escape, e.g., `\141`.
+    Octal,
+    /// The literal is written as a hex code with a fixed number of digits
+    /// depending on the type of the escape, e.g., `\x61` or or `\u0061` or
+    /// `\U00000061`.
+    HexFixed(HexLiteralKind),
+    /// The literal is written as a hex code with a bracketed number of
+    /// digits. The only restriction is that the bracketed hex code must refer
+    /// to a valid Unicode scalar value.
+    HexBrace(HexLiteralKind),
+    /// The literal is written as a specially recognized escape, e.g., `\f`
+    /// or `\n`.
+    Special(SpecialLiteralKind),
+}
+
+/// The type of a special literal.
+///
+/// A special literal is a special escape sequence recognized by the regex
+/// parser, e.g., `\f` or `\n`.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub enum SpecialLiteralKind {
+    /// Bell, spelled `\a` (`\x07`).
+    Bell,
+    /// Form feed, spelled `\f` (`\x0C`).
+    FormFeed,
+    /// Tab, spelled `\t` (`\x09`).
+    Tab,
+    /// Line feed, spelled `\n` (`\x0A`).
+    LineFeed,
+    /// Carriage return, spelled `\r` (`\x0D`).
+    CarriageReturn,
+    /// Vertical tab, spelled `\v` (`\x0B`).
+    VerticalTab,
+    /// Space, spelled `\ ` (`\x20`). Note that this can only appear when
+    /// parsing in verbose mode.
+    Space,
+}
+
+/// The type of a Unicode hex literal.
+///
+/// Note that all variants behave the same when used with brackets. They only
+/// differ when used without brackets in the number of hex digits that must
+/// follow.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub enum HexLiteralKind {
+    /// A `\x` prefix. When used without brackets, this form is limited to
+    /// two digits.
+    X,
+    /// A `\u` prefix. When used without brackets, this form is limited to
+    /// four digits.
+    UnicodeShort,
+    /// A `\U` prefix. When used without brackets, this form is limited to
+    /// eight digits.
+    UnicodeLong,
+}
+
+impl HexLiteralKind {
+    /// The number of digits that must be used with this literal form when
+    /// used without brackets. When used with brackets, there is no
+    /// restriction on the number of digits.
+    pub fn digits(&self) -> u32 {
+        match *self {
+            HexLiteralKind::X => 2,
+            HexLiteralKind::UnicodeShort => 4,
+            HexLiteralKind::UnicodeLong => 8,
+        }
+    }
+}
+
+/// A single character class expression.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub enum Class {
+    /// A Unicode character class, e.g., `\pL` or `\p{Greek}`.
+    Unicode(ClassUnicode),
+    /// A perl character class, e.g., `\d` or `\W`.
+    Perl(ClassPerl),
+    /// A bracketed character class set, which may contain zero or more
+    /// character ranges and/or zero or more nested classes. e.g.,
+    /// `[a-zA-Z\pL]`.
+    Bracketed(ClassBracketed),
+}
+
+impl Class {
+    /// Return the span of this character class.
+    pub fn span(&self) -> &Span {
+        match *self {
+            Class::Perl(ref x) => &x.span,
+            Class::Unicode(ref x) => &x.span,
+            Class::Bracketed(ref x) => &x.span,
+        }
+    }
+}
+
+/// A Perl character class.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct ClassPerl {
+    /// The span of this class.
+    pub span: Span,
+    /// The kind of Perl class.
+    pub kind: ClassPerlKind,
+    /// Whether the class is negated or not. e.g., `\d` is not negated but
+    /// `\D` is.
+    pub negated: bool,
+}
+
+/// The available Perl character classes.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub enum ClassPerlKind {
+    /// Decimal numbers.
+    Digit,
+    /// Whitespace.
+    Space,
+    /// Word characters.
+    Word,
+}
+
+/// An ASCII character class.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct ClassAscii {
+    /// The span of this class.
+    pub span: Span,
+    /// The kind of ASCII class.
+    pub kind: ClassAsciiKind,
+    /// Whether the class is negated or not. e.g., `[[:alpha:]]` is not negated
+    /// but `[[:^alpha:]]` is.
+    pub negated: bool,
+}
+
+/// The available ASCII character classes.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub enum ClassAsciiKind {
+    /// `[0-9A-Za-z]`
+    Alnum,
+    /// `[A-Za-z]`
+    Alpha,
+    /// `[\x00-\x7F]`
+    Ascii,
+    /// `[ \t]`
+    Blank,
+    /// `[\x00-\x1F\x7F]`
+    Cntrl,
+    /// `[0-9]`
+    Digit,
+    /// `[!-~]`
+    Graph,
+    /// `[a-z]`
+    Lower,
+    /// `[ -~]`
+    Print,
+    /// `[!-/:-@\[-`{-~]`
+    Punct,
+    /// `[\t\n\v\f\r ]`
+    Space,
+    /// `[A-Z]`
+    Upper,
+    /// `[0-9A-Za-z_]`
+    Word,
+    /// `[0-9A-Fa-f]`
+    Xdigit,
+}
+
+impl ClassAsciiKind {
+    /// Return the corresponding ClassAsciiKind variant for the given name.
+    ///
+    /// The name given should correspond to the lowercase version of the
+    /// variant name. e.g., `cntrl` is the name for `ClassAsciiKind::Cntrl`.
+    ///
+    /// If no variant with the corresponding name exists, then `None` is
+    /// returned.
+    pub fn from_name(name: &str) -> Option<ClassAsciiKind> {
+        use self::ClassAsciiKind::*;
+        match name {
+            "alnum" => Some(Alnum),
+            "alpha" => Some(Alpha),
+            "ascii" => Some(Ascii),
+            "blank" => Some(Blank),
+            "cntrl" => Some(Cntrl),
+            "digit" => Some(Digit),
+            "graph" => Some(Graph),
+            "lower" => Some(Lower),
+            "print" => Some(Print),
+            "punct" => Some(Punct),
+            "space" => Some(Space),
+            "upper" => Some(Upper),
+            "word" => Some(Word),
+            "xdigit" => Some(Xdigit),
+            _ => None,
+        }
+    }
+}
+
+/// A Unicode character class.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct ClassUnicode {
+    /// The span of this class.
+    pub span: Span,
+    /// Whether this class is negated or not.
+    ///
+    /// Note: be careful when using this attribute. This specifically refers
+    /// to whether the class is written as `\p` or `\P`, where the latter
+    /// is `negated = true`. However, it also possible to write something like
+    /// `\P{scx!=Katakana}` which is actually equivalent to
+    /// `\p{scx=Katakana}` and is therefore not actually negated even though
+    /// `negated = true` here. To test whether this class is truly negated
+    /// or not, use the `is_negated` method.
+    pub negated: bool,
+    /// The kind of Unicode class.
+    pub kind: ClassUnicodeKind,
+}
+
+impl ClassUnicode {
+    /// Returns true if this class has been negated.
+    ///
+    /// Note that this takes the Unicode op into account, if it's present.
+    /// e.g., `is_negated` for `\P{scx!=Katakana}` will return `false`.
+    pub fn is_negated(&self) -> bool {
+        match self.kind {
+            ClassUnicodeKind::NamedValue {
+                op: ClassUnicodeOpKind::NotEqual,
+                ..
+            } => !self.negated,
+            _ => self.negated,
+        }
+    }
+}
+
+/// The available forms of Unicode character classes.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub enum ClassUnicodeKind {
+    /// A one letter abbreviated class, e.g., `\pN`.
+    OneLetter(char),
+    /// A binary property, general category or script. The string may be
+    /// empty.
+    Named(String),
+    /// A property name and an associated value.
+    NamedValue {
+        /// The type of Unicode op used to associate `name` with `value`.
+        op: ClassUnicodeOpKind,
+        /// The property name (which may be empty).
+        name: String,
+        /// The property value (which may be empty).
+        value: String,
+    },
+}
+
+/// The type of op used in a Unicode character class.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub enum ClassUnicodeOpKind {
+    /// A property set to a specific value, e.g., `\p{scx=Katakana}`.
+    Equal,
+    /// A property set to a specific value using a colon, e.g.,
+    /// `\p{scx:Katakana}`.
+    Colon,
+    /// A property that isn't a particular value, e.g., `\p{scx!=Katakana}`.
+    NotEqual,
+}
+
+impl ClassUnicodeOpKind {
+    /// Whether the op is an equality op or not.
+    pub fn is_equal(&self) -> bool {
+        match *self {
+            ClassUnicodeOpKind::Equal | ClassUnicodeOpKind::Colon => true,
+            _ => false,
+        }
+    }
+}
+
+/// A bracketed character class, e.g., `[a-z0-9]`.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct ClassBracketed {
+    /// The span of this class.
+    pub span: Span,
+    /// Whether this class is negated or not. e.g., `[a]` is not negated but
+    /// `[^a]` is.
+    pub negated: bool,
+    /// The type of this set. A set is either a normal union of things, e.g.,
+    /// `[abc]` or a result of applying set operations, e.g., `[\pL--c]`.
+    pub kind: ClassSet,
+}
+
+/// A character class set.
+///
+/// This type corresponds to the internal structure of a bracketed character
+/// class. That is, every bracketed character is one of two types: a union of
+/// items (literals, ranges, other bracketed classes) or a tree of binary set
+/// operations.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub enum ClassSet {
+    /// An item, which can be a single literal, range, nested character class
+    /// or a union of items.
+    Item(ClassSetItem),
+    /// A single binary operation (i.e., &&, -- or ~~).
+    BinaryOp(ClassSetBinaryOp),
+}
+
+impl ClassSet {
+    /// Build a set from a union.
+    pub fn union(ast: ClassSetUnion) -> ClassSet {
+        ClassSet::Item(ClassSetItem::Union(ast))
+    }
+
+    /// Return the span of this character class set.
+    pub fn span(&self) -> &Span {
+        match *self {
+            ClassSet::Item(ref x) => x.span(),
+            ClassSet::BinaryOp(ref x) => &x.span,
+        }
+    }
+
+    /// Return true if and only if this class set is empty.
+    fn is_empty(&self) -> bool {
+        match *self {
+            ClassSet::Item(ClassSetItem::Empty(_)) => true,
+            _ => false,
+        }
+    }
+}
+
+/// A single component of a character class set.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub enum ClassSetItem {
+    /// An empty item.
+    ///
+    /// Note that a bracketed character class cannot contain a single empty
+    /// item. Empty items can appear when using one of the binary operators.
+    /// For example, `[&&]` is the intersection of two empty classes.
+    Empty(Span),
+    /// A single literal.
+    Literal(Literal),
+    /// A range between two literals.
+    Range(ClassSetRange),
+    /// An ASCII character class, e.g., `[:alnum:]` or `[:punct:]`.
+    Ascii(ClassAscii),
+    /// A Unicode character class, e.g., `\pL` or `\p{Greek}`.
+    Unicode(ClassUnicode),
+    /// A perl character class, e.g., `\d` or `\W`.
+    Perl(ClassPerl),
+    /// A bracketed character class set, which may contain zero or more
+    /// character ranges and/or zero or more nested classes. e.g.,
+    /// `[a-zA-Z\pL]`.
+    Bracketed(Box<ClassBracketed>),
+    /// A union of items.
+    Union(ClassSetUnion),
+}
+
+impl ClassSetItem {
+    /// Return the span of this character class set item.
+    pub fn span(&self) -> &Span {
+        match *self {
+            ClassSetItem::Empty(ref span) => span,
+            ClassSetItem::Literal(ref x) => &x.span,
+            ClassSetItem::Range(ref x) => &x.span,
+            ClassSetItem::Ascii(ref x) => &x.span,
+            ClassSetItem::Perl(ref x) => &x.span,
+            ClassSetItem::Unicode(ref x) => &x.span,
+            ClassSetItem::Bracketed(ref x) => &x.span,
+            ClassSetItem::Union(ref x) => &x.span,
+        }
+    }
+}
+
+/// A single character class range in a set.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct ClassSetRange {
+    /// The span of this range.
+    pub span: Span,
+    /// The start of this range.
+    pub start: Literal,
+    /// The end of this range.
+    pub end: Literal,
+}
+
+impl ClassSetRange {
+    /// Returns true if and only if this character class range is valid.
+    ///
+    /// The only case where a range is invalid is if its start is greater than
+    /// its end.
+    pub fn is_valid(&self) -> bool {
+        self.start.c <= self.end.c
+    }
+}
+
+/// A union of items inside a character class set.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct ClassSetUnion {
+    /// The span of the items in this operation. e.g., the `a-z0-9` in
+    /// `[^a-z0-9]`
+    pub span: Span,
+    /// The sequence of items that make up this union.
+    pub items: Vec<ClassSetItem>,
+}
+
+impl ClassSetUnion {
+    /// Push a new item in this union.
+    ///
+    /// The ending position of this union's span is updated to the ending
+    /// position of the span of the item given. If the union is empty, then
+    /// the starting position of this union is set to the starting position
+    /// of this item.
+    ///
+    /// In other words, if you only use this method to add items to a union
+    /// and you set the spans on each item correctly, then you should never
+    /// need to adjust the span of the union directly.
+    pub fn push(&mut self, item: ClassSetItem) {
+        if self.items.is_empty() {
+            self.span.start = item.span().start;
+        }
+        self.span.end = item.span().end;
+        self.items.push(item);
+    }
+
+    /// Return this union as a character class set item.
+    ///
+    /// If this union contains zero items, then an empty union is
+    /// returned. If this concatenation contains exactly 1 item, then the
+    /// corresponding item is returned. Otherwise, ClassSetItem::Union is
+    /// returned.
+    pub fn into_item(mut self) -> ClassSetItem {
+        match self.items.len() {
+            0 => ClassSetItem::Empty(self.span),
+            1 => self.items.pop().unwrap(),
+            _ => ClassSetItem::Union(self),
+        }
+    }
+}
+
+/// A Unicode character class set operation.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct ClassSetBinaryOp {
+    /// The span of this operation. e.g., the `a-z--[h-p]` in `[a-z--h-p]`.
+    pub span: Span,
+    /// The type of this set operation.
+    pub kind: ClassSetBinaryOpKind,
+    /// The left hand side of the operation.
+    pub lhs: Box<ClassSet>,
+    /// The right hand side of the operation.
+    pub rhs: Box<ClassSet>,
+}
+
+/// The type of a Unicode character class set operation.
+///
+/// Note that this doesn't explicitly represent union since there is no
+/// explicit union operator. Concatenation inside a character class corresponds
+/// to the union operation.
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub enum ClassSetBinaryOpKind {
+    /// The intersection of two sets, e.g., `\pN&&[a-z]`.
+    Intersection,
+    /// The difference of two sets, e.g., `\pN--[0-9]`.
+    Difference,
+    /// The symmetric difference of two sets. The symmetric difference is the
+    /// set of elements belonging to one but not both sets.
+    /// e.g., `[\pL~~[:ascii:]]`.
+    SymmetricDifference,
+}
+
+/// A single zero-width assertion.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct Assertion {
+    /// The span of this assertion.
+    pub span: Span,
+    /// The assertion kind, e.g., `\b` or `^`.
+    pub kind: AssertionKind,
+}
+
+/// An assertion kind.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub enum AssertionKind {
+    /// `^`
+    StartLine,
+    /// `$`
+    EndLine,
+    /// `\A`
+    StartText,
+    /// `\z`
+    EndText,
+    /// `\b`
+    WordBoundary,
+    /// `\B`
+    NotWordBoundary,
+}
+
+/// A repetition operation applied to a regular expression.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct Repetition {
+    /// The span of this operation.
+    pub span: Span,
+    /// The actual operation.
+    pub op: RepetitionOp,
+    /// Whether this operation was applied greedily or not.
+    pub greedy: bool,
+    /// The regular expression under repetition.
+    pub ast: Box<Ast>,
+}
+
+/// The repetition operator itself.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct RepetitionOp {
+    /// The span of this operator. This includes things like `+`, `*?` and
+    /// `{m,n}`.
+    pub span: Span,
+    /// The type of operation.
+    pub kind: RepetitionKind,
+}
+
+/// The kind of a repetition operator.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub enum RepetitionKind {
+    /// `?`
+    ZeroOrOne,
+    /// `*`
+    ZeroOrMore,
+    /// `+`
+    OneOrMore,
+    /// `{m,n}`
+    Range(RepetitionRange),
+}
+
+/// A range repetition operator.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub enum RepetitionRange {
+    /// `{m}`
+    Exactly(u32),
+    /// `{m,}`
+    AtLeast(u32),
+    /// `{m,n}`
+    Bounded(u32, u32),
+}
+
+impl RepetitionRange {
+    /// Returns true if and only if this repetition range is valid.
+    ///
+    /// The only case where a repetition range is invalid is if it is bounded
+    /// and its start is greater than its end.
+    pub fn is_valid(&self) -> bool {
+        match *self {
+            RepetitionRange::Bounded(s, e) if s > e => false,
+            _ => true,
+        }
+    }
+}
+
+/// A grouped regular expression.
+///
+/// This includes both capturing and non-capturing groups. This does **not**
+/// include flag-only groups like `(?is)`, but does contain any group that
+/// contains a sub-expression, e.g., `(a)`, `(?P<name>a)`, `(?:a)` and
+/// `(?is:a)`.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct Group {
+    /// The span of this group.
+    pub span: Span,
+    /// The kind of this group.
+    pub kind: GroupKind,
+    /// The regular expression in this group.
+    pub ast: Box<Ast>,
+}
+
+impl Group {
+    /// If this group is non-capturing, then this returns the (possibly empty)
+    /// set of flags. Otherwise, `None` is returned.
+    pub fn flags(&self) -> Option<&Flags> {
+        match self.kind {
+            GroupKind::NonCapturing(ref flags) => Some(flags),
+            _ => None,
+        }
+    }
+
+    /// Returns true if and only if this group is capturing.
+    pub fn is_capturing(&self) -> bool {
+        match self.kind {
+            GroupKind::CaptureIndex(_) | GroupKind::CaptureName(_) => true,
+            GroupKind::NonCapturing(_) => false,
+        }
+    }
+
+    /// Returns the capture index of this group, if this is a capturing group.
+    ///
+    /// This returns a capture index precisely when `is_capturing` is `true`.
+    pub fn capture_index(&self) -> Option<u32> {
+        match self.kind {
+            GroupKind::CaptureIndex(i) => Some(i),
+            GroupKind::CaptureName(ref x) => Some(x.index),
+            GroupKind::NonCapturing(_) => None,
+        }
+    }
+}
+
+/// The kind of a group.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub enum GroupKind {
+    /// `(a)`
+    CaptureIndex(u32),
+    /// `(?P<name>a)`
+    CaptureName(CaptureName),
+    /// `(?:a)` and `(?i:a)`
+    NonCapturing(Flags),
+}
+
+/// A capture name.
+///
+/// This corresponds to the name itself between the angle brackets in, e.g.,
+/// `(?P<foo>expr)`.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct CaptureName {
+    /// The span of this capture name.
+    pub span: Span,
+    /// The capture name.
+    pub name: String,
+    /// The capture index.
+    pub index: u32,
+}
+
+/// A group of flags that is not applied to a particular regular expression.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct SetFlags {
+    /// The span of these flags, including the grouping parentheses.
+    pub span: Span,
+    /// The actual sequence of flags.
+    pub flags: Flags,
+}
+
+/// A group of flags.
+///
+/// This corresponds only to the sequence of flags themselves, e.g., `is-u`.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct Flags {
+    /// The span of this group of flags.
+    pub span: Span,
+    /// A sequence of flag items. Each item is either a flag or a negation
+    /// operator.
+    pub items: Vec<FlagsItem>,
+}
+
+impl Flags {
+    /// Add the given item to this sequence of flags.
+    ///
+    /// If the item was added successfully, then `None` is returned. If the
+    /// given item is a duplicate, then `Some(i)` is returned, where
+    /// `items[i].kind == item.kind`.
+    pub fn add_item(&mut self, item: FlagsItem) -> Option<usize> {
+        for (i, x) in self.items.iter().enumerate() {
+            if x.kind == item.kind {
+                return Some(i);
+            }
+        }
+        self.items.push(item);
+        None
+    }
+
+    /// Returns the state of the given flag in this set.
+    ///
+    /// If the given flag is in the set but is negated, then `Some(false)` is
+    /// returned.
+    ///
+    /// If the given flag is in the set and is not negated, then `Some(true)`
+    /// is returned.
+    ///
+    /// Otherwise, `None` is returned.
+    pub fn flag_state(&self, flag: Flag) -> Option<bool> {
+        let mut negated = false;
+        for x in &self.items {
+            match x.kind {
+                FlagsItemKind::Negation => {
+                    negated = true;
+                }
+                FlagsItemKind::Flag(ref xflag) if xflag == &flag => {
+                    return Some(!negated);
+                }
+                _ => {}
+            }
+        }
+        None
+    }
+}
+
+/// A single item in a group of flags.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct FlagsItem {
+    /// The span of this item.
+    pub span: Span,
+    /// The kind of this item.
+    pub kind: FlagsItemKind,
+}
+
+/// The kind of an item in a group of flags.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub enum FlagsItemKind {
+    /// A negation operator applied to all subsequent flags in the enclosing
+    /// group.
+    Negation,
+    /// A single flag in a group.
+    Flag(Flag),
+}
+
+impl FlagsItemKind {
+    /// Returns true if and only if this item is a negation operator.
+    pub fn is_negation(&self) -> bool {
+        match *self {
+            FlagsItemKind::Negation => true,
+            _ => false,
+        }
+    }
+}
+
+/// A single flag.
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub enum Flag {
+    /// `i`
+    CaseInsensitive,
+    /// `m`
+    MultiLine,
+    /// `s`
+    DotMatchesNewLine,
+    /// `U`
+    SwapGreed,
+    /// `u`
+    Unicode,
+    /// `x`
+    IgnoreWhitespace,
+}
+
+/// A custom `Drop` impl is used for `Ast` such that it uses constant stack
+/// space but heap space proportional to the depth of the `Ast`.
+impl Drop for Ast {
+    fn drop(&mut self) {
+        use std::mem;
+
+        match *self {
+            Ast::Empty(_)
+            | Ast::Flags(_)
+            | Ast::Literal(_)
+            | Ast::Dot(_)
+            | Ast::Assertion(_)
+            // Classes are recursive, so they get their own Drop impl.
+            | Ast::Class(_) => return,
+            Ast::Repetition(ref x) if !x.ast.has_subexprs() => return,
+            Ast::Group(ref x) if !x.ast.has_subexprs() => return,
+            Ast::Alternation(ref x) if x.asts.is_empty() => return,
+            Ast::Concat(ref x) if x.asts.is_empty() => return,
+            _ => {}
+        }
+
+        let empty_span = || Span::splat(Position::new(0, 0, 0));
+        let empty_ast = || Ast::Empty(empty_span());
+        let mut stack = vec![mem::replace(self, empty_ast())];
+        while let Some(mut ast) = stack.pop() {
+            match ast {
+                Ast::Empty(_)
+                | Ast::Flags(_)
+                | Ast::Literal(_)
+                | Ast::Dot(_)
+                | Ast::Assertion(_)
+                // Classes are recursive, so they get their own Drop impl.
+                | Ast::Class(_) => {}
+                Ast::Repetition(ref mut x) => {
+                    stack.push(mem::replace(&mut x.ast, empty_ast()));
+                }
+                Ast::Group(ref mut x) => {
+                    stack.push(mem::replace(&mut x.ast, empty_ast()));
+                }
+                Ast::Alternation(ref mut x) => {
+                    stack.extend(x.asts.drain(..));
+                }
+                Ast::Concat(ref mut x) => {
+                    stack.extend(x.asts.drain(..));
+                }
+            }
+        }
+    }
+}
+
+/// A custom `Drop` impl is used for `ClassSet` such that it uses constant
+/// stack space but heap space proportional to the depth of the `ClassSet`.
+impl Drop for ClassSet {
+    fn drop(&mut self) {
+        use std::mem;
+
+        match *self {
+            ClassSet::Item(ref item) => match *item {
+                ClassSetItem::Empty(_)
+                | ClassSetItem::Literal(_)
+                | ClassSetItem::Range(_)
+                | ClassSetItem::Ascii(_)
+                | ClassSetItem::Unicode(_)
+                | ClassSetItem::Perl(_) => return,
+                ClassSetItem::Bracketed(ref x) => {
+                    if x.kind.is_empty() {
+                        return;
+                    }
+                }
+                ClassSetItem::Union(ref x) => {
+                    if x.items.is_empty() {
+                        return;
+                    }
+                }
+            },
+            ClassSet::BinaryOp(ref op) => {
+                if op.lhs.is_empty() && op.rhs.is_empty() {
+                    return;
+                }
+            }
+        }
+
+        let empty_span = || Span::splat(Position::new(0, 0, 0));
+        let empty_set = || ClassSet::Item(ClassSetItem::Empty(empty_span()));
+        let mut stack = vec![mem::replace(self, empty_set())];
+        while let Some(mut set) = stack.pop() {
+            match set {
+                ClassSet::Item(ref mut item) => match *item {
+                    ClassSetItem::Empty(_)
+                    | ClassSetItem::Literal(_)
+                    | ClassSetItem::Range(_)
+                    | ClassSetItem::Ascii(_)
+                    | ClassSetItem::Unicode(_)
+                    | ClassSetItem::Perl(_) => {}
+                    ClassSetItem::Bracketed(ref mut x) => {
+                        stack.push(mem::replace(&mut x.kind, empty_set()));
+                    }
+                    ClassSetItem::Union(ref mut x) => {
+                        stack.extend(x.items.drain(..).map(ClassSet::Item));
+                    }
+                },
+                ClassSet::BinaryOp(ref mut op) => {
+                    stack.push(mem::replace(&mut op.lhs, empty_set()));
+                    stack.push(mem::replace(&mut op.rhs, empty_set()));
+                }
+            }
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    // We use a thread with an explicit stack size to test that our destructor
+    // for Ast can handle arbitrarily sized expressions in constant stack
+    // space. In case we run on a platform without threads (WASM?), we limit
+    // this test to Windows/Unix.
+    #[test]
+    #[cfg(any(unix, windows))]
+    fn no_stack_overflow_on_drop() {
+        use std::thread;
+
+        let run = || {
+            let span = || Span::splat(Position::new(0, 0, 0));
+            let mut ast = Ast::Empty(span());
+            for i in 0..200 {
+                ast = Ast::Group(Group {
+                    span: span(),
+                    kind: GroupKind::CaptureIndex(i),
+                    ast: Box::new(ast),
+                });
+            }
+            assert!(!ast.is_empty());
+        };
+
+        // We run our test on a thread with a small stack size so we can
+        // force the issue more easily.
+        thread::Builder::new()
+            .stack_size(1 << 10)
+            .spawn(run)
+            .unwrap()
+            .join()
+            .unwrap();
+    }
+}
commit	048fc04e7fd0dd2014a94e73fb34f6e59021c170	[log] [tgz]
author	Chih-Hung Hsieh <chh@google.com>	Thu Apr 16 10:44:22 2020 -0700
committer	Chih-Hung Hsieh <chh@google.com>	Thu Apr 16 10:46:29 2020 -0700
tree	f9b501b392f13f2ca9d86d0a9d377b181f624dfb
parent	2dd07da5420c40e365646bf756938d8d1abf6d62 [diff] [blame]