Ian Hodson | 2ee91b4 | 2012-05-14 12:29:36 +0100 | [diff] [blame] | 1 | // Copyright 2006 The RE2 Authors. All Rights Reserved. |
| 2 | // Use of this source code is governed by a BSD-style |
| 3 | // license that can be found in the LICENSE file. |
| 4 | |
| 5 | // --- SPONSORED LINK -------------------------------------------------- |
| 6 | // If you want to use this library for regular expression matching, |
| 7 | // you should use re2/re2.h, which provides a class RE2 that |
| 8 | // mimics the PCRE interface provided by PCRE's C++ wrappers. |
| 9 | // This header describes the low-level interface used to implement RE2 |
| 10 | // and may change in backwards-incompatible ways from time to time. |
| 11 | // In contrast, RE2's interface will not. |
| 12 | // --------------------------------------------------------------------- |
| 13 | |
| 14 | // Regular expression library: parsing, execution, and manipulation |
| 15 | // of regular expressions. |
| 16 | // |
| 17 | // Any operation that traverses the Regexp structures should be written |
| 18 | // using Regexp::Walker (see walker-inl.h), not recursively, because deeply nested |
| 19 | // regular expressions such as x++++++++++++++++++++... might cause recursive |
| 20 | // traversals to overflow the stack. |
| 21 | // |
| 22 | // It is the caller's responsibility to provide appropriate mutual exclusion |
| 23 | // around manipulation of the regexps. RE2 does this. |
| 24 | // |
| 25 | // PARSING |
| 26 | // |
| 27 | // Regexp::Parse parses regular expressions encoded in UTF-8. |
| 28 | // The default syntax is POSIX extended regular expressions, |
| 29 | // with the following changes: |
| 30 | // |
| 31 | // 1. Backreferences (optional in POSIX EREs) are not supported. |
| 32 | // (Supporting them precludes the use of DFA-based |
| 33 | // matching engines.) |
| 34 | // |
| 35 | // 2. Collating elements and collation classes are not supported. |
| 36 | // (No one has needed or wanted them.) |
| 37 | // |
| 38 | // The exact syntax accepted can be modified by passing flags to |
| 39 | // Regexp::Parse. In particular, many of the basic Perl additions |
| 40 | // are available. The flags are documented below (search for LikePerl). |
| 41 | // |
| 42 | // If parsed with the flag Regexp::Latin1, both the regular expression |
| 43 | // and the input to the matching routines are assumed to be encoded in |
| 44 | // Latin-1, not UTF-8. |
| 45 | // |
| 46 | // EXECUTION |
| 47 | // |
| 48 | // Once Regexp has parsed a regular expression, it provides methods |
| 49 | // to search text using that regular expression. These methods are |
| 50 | // implemented via calling out to other regular expression libraries. |
| 51 | // (Let's call them the sublibraries.) |
| 52 | // |
| 53 | // To call a sublibrary, Regexp does not simply prepare a |
| 54 | // string version of the regular expression and hand it to the |
| 55 | // sublibrary. Instead, Regexp prepares, from its own parsed form, the |
| 56 | // corresponding internal representation used by the sublibrary. |
| 57 | // This has the drawback of needing to know the internal representation |
| 58 | // used by the sublibrary, but it has two important benefits: |
| 59 | // |
| 60 | // 1. The syntax and meaning of regular expressions is guaranteed |
| 61 | // to be that used by Regexp's parser, not the syntax expected |
| 62 | // by the sublibrary. Regexp might accept a restricted or |
| 63 | // expanded syntax for regular expressions as compared with |
| 64 | // the sublibrary. As long as Regexp can translate from its |
| 65 | // internal form into the sublibrary's, clients need not know |
| 66 | // exactly which sublibrary they are using. |
| 67 | // |
| 68 | // 2. The sublibrary parsers are bypassed. For whatever reason, |
| 69 | // sublibrary regular expression parsers often have security |
| 70 | // problems. For example, plan9grep's regular expression parser |
| 71 | // has a buffer overflow in its handling of large character |
| 72 | // classes, and PCRE's parser has had buffer overflow problems |
| 73 | // in the past. Security-team requires sandboxing of sublibrary |
| 74 | // regular expression parsers. Avoiding the sublibrary parsers |
| 75 | // avoids the sandbox. |
| 76 | // |
| 77 | // The execution methods we use now are provided by the compiled form, |
| 78 | // Prog, described in prog.h |
| 79 | // |
| 80 | // MANIPULATION |
| 81 | // |
| 82 | // Unlike other regular expression libraries, Regexp makes its parsed |
| 83 | // form accessible to clients, so that client code can analyze the |
| 84 | // parsed regular expressions. |
| 85 | |
| 86 | #ifndef RE2_REGEXP_H__ |
| 87 | #define RE2_REGEXP_H__ |
| 88 | |
| 89 | #include "util/util.h" |
| 90 | #include "re2/stringpiece.h" |
| 91 | |
| 92 | namespace re2 { |
| 93 | |
| 94 | // Keep in sync with string list kOpcodeNames[] in testing/dump.cc |
| 95 | enum RegexpOp { |
| 96 | // Matches no strings. |
| 97 | kRegexpNoMatch = 1, |
| 98 | |
| 99 | // Matches empty string. |
| 100 | kRegexpEmptyMatch, |
| 101 | |
| 102 | // Matches rune_. |
| 103 | kRegexpLiteral, |
| 104 | |
| 105 | // Matches runes_. |
| 106 | kRegexpLiteralString, |
| 107 | |
| 108 | // Matches concatenation of sub_[0..nsub-1]. |
| 109 | kRegexpConcat, |
| 110 | // Matches union of sub_[0..nsub-1]. |
| 111 | kRegexpAlternate, |
| 112 | |
| 113 | // Matches sub_[0] zero or more times. |
| 114 | kRegexpStar, |
| 115 | // Matches sub_[0] one or more times. |
| 116 | kRegexpPlus, |
| 117 | // Matches sub_[0] zero or one times. |
| 118 | kRegexpQuest, |
| 119 | |
| 120 | // Matches sub_[0] at least min_ times, at most max_ times. |
| 121 | // max_ == -1 means no upper limit. |
| 122 | kRegexpRepeat, |
| 123 | |
| 124 | // Parenthesized (capturing) subexpression. Index is cap_. |
| 125 | // Optionally, capturing name is name_. |
| 126 | kRegexpCapture, |
| 127 | |
| 128 | // Matches any character. |
| 129 | kRegexpAnyChar, |
| 130 | |
| 131 | // Matches any byte [sic]. |
| 132 | kRegexpAnyByte, |
| 133 | |
| 134 | // Matches empty string at beginning of line. |
| 135 | kRegexpBeginLine, |
| 136 | // Matches empty string at end of line. |
| 137 | kRegexpEndLine, |
| 138 | |
| 139 | // Matches word boundary "\b". |
| 140 | kRegexpWordBoundary, |
| 141 | // Matches not-a-word boundary "\B". |
| 142 | kRegexpNoWordBoundary, |
| 143 | |
| 144 | // Matches empty string at beginning of text. |
| 145 | kRegexpBeginText, |
| 146 | // Matches empty string at end of text. |
| 147 | kRegexpEndText, |
| 148 | |
| 149 | // Matches character class given by cc_. |
| 150 | kRegexpCharClass, |
| 151 | |
| 152 | // Forces match of entire expression right now, |
| 153 | // with match ID match_id_ (used by RE2::Set). |
| 154 | kRegexpHaveMatch, |
| 155 | |
| 156 | kMaxRegexpOp = kRegexpHaveMatch, |
| 157 | }; |
| 158 | |
| 159 | // Keep in sync with string list in regexp.cc |
| 160 | enum RegexpStatusCode { |
| 161 | // No error |
| 162 | kRegexpSuccess = 0, |
| 163 | |
| 164 | // Unexpected error |
| 165 | kRegexpInternalError, |
| 166 | |
| 167 | // Parse errors |
| 168 | kRegexpBadEscape, // bad escape sequence |
| 169 | kRegexpBadCharClass, // bad character class |
| 170 | kRegexpBadCharRange, // bad character class range |
| 171 | kRegexpMissingBracket, // missing closing ] |
| 172 | kRegexpMissingParen, // missing closing ) |
| 173 | kRegexpTrailingBackslash, // at end of regexp |
| 174 | kRegexpRepeatArgument, // repeat argument missing, e.g. "*" |
| 175 | kRegexpRepeatSize, // bad repetition argument |
| 176 | kRegexpRepeatOp, // bad repetition operator |
| 177 | kRegexpBadPerlOp, // bad perl operator |
| 178 | kRegexpBadUTF8, // invalid UTF-8 in regexp |
| 179 | kRegexpBadNamedCapture, // bad named capture |
| 180 | }; |
| 181 | |
| 182 | // Error status for certain operations. |
| 183 | class RegexpStatus { |
| 184 | public: |
| 185 | RegexpStatus() : code_(kRegexpSuccess), tmp_(NULL) {} |
| 186 | ~RegexpStatus() { delete tmp_; } |
| 187 | |
| 188 | void set_code(enum RegexpStatusCode code) { code_ = code; } |
| 189 | void set_error_arg(const StringPiece& error_arg) { error_arg_ = error_arg; } |
| 190 | void set_tmp(string* tmp) { delete tmp_; tmp_ = tmp; } |
| 191 | enum RegexpStatusCode code() const { return code_; } |
| 192 | const StringPiece& error_arg() const { return error_arg_; } |
| 193 | bool ok() const { return code() == kRegexpSuccess; } |
| 194 | |
| 195 | // Copies state from status. |
| 196 | void Copy(const RegexpStatus& status); |
| 197 | |
| 198 | // Returns text equivalent of code, e.g.: |
| 199 | // "Bad character class" |
Alexander Gutkin | 0d4c523 | 2013-02-28 13:47:27 +0000 | [diff] [blame] | 200 | static string CodeText(enum RegexpStatusCode code); |
Ian Hodson | 2ee91b4 | 2012-05-14 12:29:36 +0100 | [diff] [blame] | 201 | |
| 202 | // Returns text describing error, e.g.: |
| 203 | // "Bad character class: [z-a]" |
| 204 | string Text() const; |
| 205 | |
| 206 | private: |
| 207 | enum RegexpStatusCode code_; // Kind of error |
| 208 | StringPiece error_arg_; // Piece of regexp containing syntax error. |
| 209 | string* tmp_; // Temporary storage, possibly where error_arg_ is. |
| 210 | |
| 211 | DISALLOW_EVIL_CONSTRUCTORS(RegexpStatus); |
| 212 | }; |
| 213 | |
| 214 | // Walker to implement Simplify. |
| 215 | class SimplifyWalker; |
| 216 | |
| 217 | // Compiled form; see prog.h |
| 218 | class Prog; |
| 219 | |
| 220 | struct RuneRange { |
| 221 | RuneRange() : lo(0), hi(0) { } |
| 222 | RuneRange(int l, int h) : lo(l), hi(h) { } |
| 223 | Rune lo; |
| 224 | Rune hi; |
| 225 | }; |
| 226 | |
| 227 | // Less-than on RuneRanges treats a == b if they overlap at all. |
| 228 | // This lets us look in a set to find the range covering a particular Rune. |
| 229 | struct RuneRangeLess { |
| 230 | bool operator()(const RuneRange& a, const RuneRange& b) const { |
| 231 | return a.hi < b.lo; |
| 232 | } |
| 233 | }; |
| 234 | |
| 235 | class CharClassBuilder; |
| 236 | |
| 237 | class CharClass { |
| 238 | public: |
| 239 | void Delete(); |
| 240 | |
| 241 | typedef RuneRange* iterator; |
| 242 | iterator begin() { return ranges_; } |
| 243 | iterator end() { return ranges_ + nranges_; } |
| 244 | |
| 245 | int size() { return nrunes_; } |
| 246 | bool empty() { return nrunes_ == 0; } |
| 247 | bool full() { return nrunes_ == Runemax+1; } |
| 248 | bool FoldsASCII() { return folds_ascii_; } |
| 249 | |
| 250 | bool Contains(Rune r); |
| 251 | CharClass* Negate(); |
| 252 | |
| 253 | private: |
| 254 | CharClass(); // not implemented |
| 255 | ~CharClass(); // not implemented |
| 256 | static CharClass* New(int maxranges); |
| 257 | |
| 258 | friend class CharClassBuilder; |
| 259 | |
| 260 | bool folds_ascii_; |
| 261 | int nrunes_; |
| 262 | RuneRange *ranges_; |
| 263 | int nranges_; |
| 264 | DISALLOW_EVIL_CONSTRUCTORS(CharClass); |
| 265 | }; |
| 266 | |
| 267 | class Regexp { |
| 268 | public: |
| 269 | |
| 270 | // Flags for parsing. Can be ORed together. |
| 271 | enum ParseFlags { |
| 272 | NoParseFlags = 0, |
| 273 | FoldCase = 1<<0, // Fold case during matching (case-insensitive). |
| 274 | Literal = 1<<1, // Treat s as literal string instead of a regexp. |
| 275 | ClassNL = 1<<2, // Allow char classes like [^a-z] and \D and \s |
| 276 | // and [[:space:]] to match newline. |
| 277 | DotNL = 1<<3, // Allow . to match newline. |
| 278 | MatchNL = ClassNL | DotNL, |
| 279 | OneLine = 1<<4, // Treat ^ and $ as only matching at beginning and |
| 280 | // end of text, not around embedded newlines. |
| 281 | // (Perl's default) |
| 282 | Latin1 = 1<<5, // Regexp and text are in Latin1, not UTF-8. |
| 283 | NonGreedy = 1<<6, // Repetition operators are non-greedy by default. |
| 284 | PerlClasses = 1<<7, // Allow Perl character classes like \d. |
| 285 | PerlB = 1<<8, // Allow Perl's \b and \B. |
| 286 | PerlX = 1<<9, // Perl extensions: |
| 287 | // non-capturing parens - (?: ) |
| 288 | // non-greedy operators - *? +? ?? {}? |
| 289 | // flag edits - (?i) (?-i) (?i: ) |
| 290 | // i - FoldCase |
| 291 | // m - !OneLine |
| 292 | // s - DotNL |
| 293 | // U - NonGreedy |
| 294 | // line ends: \A \z |
| 295 | // \Q and \E to disable/enable metacharacters |
| 296 | // (?P<name>expr) for named captures |
| 297 | // \C to match any single byte |
| 298 | UnicodeGroups = 1<<10, // Allow \p{Han} for Unicode Han group |
| 299 | // and \P{Han} for its negation. |
| 300 | NeverNL = 1<<11, // Never match NL, even if the regexp mentions |
| 301 | // it explicitly. |
Alexander Gutkin | 0d4c523 | 2013-02-28 13:47:27 +0000 | [diff] [blame] | 302 | NeverCapture = 1<<12, // Parse all parens as non-capturing. |
Ian Hodson | 2ee91b4 | 2012-05-14 12:29:36 +0100 | [diff] [blame] | 303 | |
| 304 | // As close to Perl as we can get. |
| 305 | LikePerl = ClassNL | OneLine | PerlClasses | PerlB | PerlX | |
| 306 | UnicodeGroups, |
| 307 | |
| 308 | // Internal use only. |
| 309 | WasDollar = 1<<15, // on kRegexpEndText: was $ in regexp text |
| 310 | }; |
| 311 | |
| 312 | // Get. No set, Regexps are logically immutable once created. |
| 313 | RegexpOp op() { return static_cast<RegexpOp>(op_); } |
| 314 | int nsub() { return nsub_; } |
| 315 | bool simple() { return simple_; } |
| 316 | enum ParseFlags parse_flags() { return static_cast<ParseFlags>(parse_flags_); } |
| 317 | int Ref(); // For testing. |
| 318 | |
| 319 | Regexp** sub() { |
| 320 | if(nsub_ <= 1) |
| 321 | return &subone_; |
| 322 | else |
| 323 | return submany_; |
| 324 | } |
| 325 | |
| 326 | int min() { DCHECK_EQ(op_, kRegexpRepeat); return min_; } |
| 327 | int max() { DCHECK_EQ(op_, kRegexpRepeat); return max_; } |
| 328 | Rune rune() { DCHECK_EQ(op_, kRegexpLiteral); return rune_; } |
| 329 | CharClass* cc() { DCHECK_EQ(op_, kRegexpCharClass); return cc_; } |
| 330 | int cap() { DCHECK_EQ(op_, kRegexpCapture); return cap_; } |
| 331 | const string* name() { DCHECK_EQ(op_, kRegexpCapture); return name_; } |
| 332 | Rune* runes() { DCHECK_EQ(op_, kRegexpLiteralString); return runes_; } |
| 333 | int nrunes() { DCHECK_EQ(op_, kRegexpLiteralString); return nrunes_; } |
| 334 | int match_id() { DCHECK_EQ(op_, kRegexpHaveMatch); return match_id_; } |
| 335 | |
| 336 | // Increments reference count, returns object as convenience. |
| 337 | Regexp* Incref(); |
| 338 | |
| 339 | // Decrements reference count and deletes this object if count reaches 0. |
| 340 | void Decref(); |
| 341 | |
| 342 | // Parses string s to produce regular expression, returned. |
| 343 | // Caller must release return value with re->Decref(). |
| 344 | // On failure, sets *status (if status != NULL) and returns NULL. |
| 345 | static Regexp* Parse(const StringPiece& s, ParseFlags flags, |
| 346 | RegexpStatus* status); |
| 347 | |
| 348 | // Returns a _new_ simplified version of the current regexp. |
| 349 | // Does not edit the current regexp. |
| 350 | // Caller must release return value with re->Decref(). |
| 351 | // Simplified means that counted repetition has been rewritten |
| 352 | // into simpler terms and all Perl/POSIX features have been |
| 353 | // removed. The result will capture exactly the same |
| 354 | // subexpressions the original did, unless formatted with ToString. |
| 355 | Regexp* Simplify(); |
| 356 | friend class SimplifyWalker; |
| 357 | |
| 358 | // Parses the regexp src and then simplifies it and sets *dst to the |
| 359 | // string representation of the simplified form. Returns true on success. |
| 360 | // Returns false and sets *status (if status != NULL) on parse error. |
| 361 | static bool SimplifyRegexp(const StringPiece& src, ParseFlags flags, |
| 362 | string* dst, |
| 363 | RegexpStatus* status); |
| 364 | |
| 365 | // Returns the number of capturing groups in the regexp. |
| 366 | int NumCaptures(); |
| 367 | friend class NumCapturesWalker; |
| 368 | |
| 369 | // Returns a map from names to capturing group indices, |
| 370 | // or NULL if the regexp contains no named capture groups. |
| 371 | // The caller is responsible for deleting the map. |
| 372 | map<string, int>* NamedCaptures(); |
| 373 | |
| 374 | // Returns a map from capturing group indices to capturing group |
| 375 | // names or NULL if the regexp contains no named capture groups. The |
| 376 | // caller is responsible for deleting the map. |
| 377 | map<int, string>* CaptureNames(); |
| 378 | |
| 379 | // Returns a string representation of the current regexp, |
| 380 | // using as few parentheses as possible. |
| 381 | string ToString(); |
| 382 | |
| 383 | // Convenience functions. They consume the passed reference, |
| 384 | // so in many cases you should use, e.g., Plus(re->Incref(), flags). |
| 385 | // They do not consume allocated arrays like subs or runes. |
| 386 | static Regexp* Plus(Regexp* sub, ParseFlags flags); |
| 387 | static Regexp* Star(Regexp* sub, ParseFlags flags); |
| 388 | static Regexp* Quest(Regexp* sub, ParseFlags flags); |
| 389 | static Regexp* Concat(Regexp** subs, int nsubs, ParseFlags flags); |
| 390 | static Regexp* Alternate(Regexp** subs, int nsubs, ParseFlags flags); |
| 391 | static Regexp* Capture(Regexp* sub, ParseFlags flags, int cap); |
| 392 | static Regexp* Repeat(Regexp* sub, ParseFlags flags, int min, int max); |
| 393 | static Regexp* NewLiteral(Rune rune, ParseFlags flags); |
| 394 | static Regexp* NewCharClass(CharClass* cc, ParseFlags flags); |
| 395 | static Regexp* LiteralString(Rune* runes, int nrunes, ParseFlags flags); |
| 396 | static Regexp* HaveMatch(int match_id, ParseFlags flags); |
| 397 | |
| 398 | // Like Alternate but does not factor out common prefixes. |
| 399 | static Regexp* AlternateNoFactor(Regexp** subs, int nsubs, ParseFlags flags); |
| 400 | |
| 401 | // Debugging function. Returns string format for regexp |
| 402 | // that makes structure clear. Does NOT use regexp syntax. |
| 403 | string Dump(); |
| 404 | |
| 405 | // Helper traversal class, defined fully in walker-inl.h. |
| 406 | template<typename T> class Walker; |
| 407 | |
| 408 | // Compile to Prog. See prog.h |
| 409 | // Reverse prog expects to be run over text backward. |
| 410 | // Construction and execution of prog will |
| 411 | // stay within approximately max_mem bytes of memory. |
| 412 | // If max_mem <= 0, a reasonable default is used. |
| 413 | Prog* CompileToProg(int64 max_mem); |
| 414 | Prog* CompileToReverseProg(int64 max_mem); |
| 415 | |
| 416 | // Whether to expect this library to find exactly the same answer as PCRE |
| 417 | // when running this regexp. Most regexps do mimic PCRE exactly, but a few |
| 418 | // obscure cases behave differently. Technically this is more a property |
| 419 | // of the Prog than the Regexp, but the computation is much easier to do |
| 420 | // on the Regexp. See mimics_pcre.cc for the exact conditions. |
| 421 | bool MimicsPCRE(); |
| 422 | |
| 423 | // Benchmarking function. |
| 424 | void NullWalk(); |
| 425 | |
| 426 | // Whether every match of this regexp must be anchored and |
| 427 | // begin with a non-empty fixed string (perhaps after ASCII |
| 428 | // case-folding). If so, returns the prefix and the sub-regexp that |
| 429 | // follows it. |
| 430 | bool RequiredPrefix(string* prefix, bool *foldcase, Regexp** suffix); |
| 431 | |
| 432 | private: |
| 433 | // Constructor allocates vectors as appropriate for operator. |
| 434 | explicit Regexp(RegexpOp op, ParseFlags parse_flags); |
| 435 | |
| 436 | // Use Decref() instead of delete to release Regexps. |
| 437 | // This is private to catch deletes at compile time. |
| 438 | ~Regexp(); |
| 439 | void Destroy(); |
| 440 | bool QuickDestroy(); |
| 441 | |
| 442 | // Helpers for Parse. Listed here so they can edit Regexps. |
| 443 | class ParseState; |
| 444 | friend class ParseState; |
| 445 | friend bool ParseCharClass(StringPiece* s, Regexp** out_re, |
| 446 | RegexpStatus* status); |
| 447 | |
| 448 | // Helper for testing [sic]. |
| 449 | friend bool RegexpEqualTestingOnly(Regexp*, Regexp*); |
| 450 | |
| 451 | // Computes whether Regexp is already simple. |
| 452 | bool ComputeSimple(); |
| 453 | |
| 454 | // Constructor that generates a concatenation or alternation, |
| 455 | // enforcing the limit on the number of subexpressions for |
| 456 | // a particular Regexp. |
| 457 | static Regexp* ConcatOrAlternate(RegexpOp op, Regexp** subs, int nsubs, |
| 458 | ParseFlags flags, bool can_factor); |
| 459 | |
| 460 | // Returns the leading string that re starts with. |
| 461 | // The returned Rune* points into a piece of re, |
| 462 | // so it must not be used after the caller calls re->Decref(). |
| 463 | static Rune* LeadingString(Regexp* re, int* nrune, ParseFlags* flags); |
| 464 | |
| 465 | // Removes the first n leading runes from the beginning of re. |
| 466 | // Edits re in place. |
| 467 | static void RemoveLeadingString(Regexp* re, int n); |
| 468 | |
| 469 | // Returns the leading regexp in re's top-level concatenation. |
| 470 | // The returned Regexp* points at re or a sub-expression of re, |
| 471 | // so it must not be used after the caller calls re->Decref(). |
| 472 | static Regexp* LeadingRegexp(Regexp* re); |
| 473 | |
| 474 | // Removes LeadingRegexp(re) from re and returns the remainder. |
| 475 | // Might edit re in place. |
| 476 | static Regexp* RemoveLeadingRegexp(Regexp* re); |
| 477 | |
| 478 | // Simplifies an alternation of literal strings by factoring out |
| 479 | // common prefixes. |
| 480 | static int FactorAlternation(Regexp** sub, int nsub, ParseFlags flags); |
| 481 | static int FactorAlternationRecursive(Regexp** sub, int nsub, |
| 482 | ParseFlags flags, int maxdepth); |
| 483 | |
| 484 | // Is a == b? Only efficient on regexps that have not been through |
| 485 | // Simplify yet - the expansion of a kRegexpRepeat will make this |
| 486 | // take a long time. Do not call on such regexps, hence private. |
| 487 | static bool Equal(Regexp* a, Regexp* b); |
| 488 | |
| 489 | // Allocate space for n sub-regexps. |
| 490 | void AllocSub(int n) { |
| 491 | if (n < 0 || static_cast<uint16>(n) != n) |
| 492 | LOG(FATAL) << "Cannot AllocSub " << n; |
| 493 | if (n > 1) |
| 494 | submany_ = new Regexp*[n]; |
| 495 | nsub_ = n; |
| 496 | } |
| 497 | |
| 498 | // Add Rune to LiteralString |
| 499 | void AddRuneToString(Rune r); |
| 500 | |
| 501 | // Swaps this with that, in place. |
| 502 | void Swap(Regexp *that); |
| 503 | |
| 504 | // Operator. See description of operators above. |
| 505 | // uint8 instead of RegexpOp to control space usage. |
| 506 | uint8 op_; |
| 507 | |
| 508 | // Is this regexp structure already simple |
| 509 | // (has it been returned by Simplify)? |
| 510 | // uint8 instead of bool to control space usage. |
| 511 | uint8 simple_; |
| 512 | |
| 513 | // Flags saved from parsing and used during execution. |
| 514 | // (Only FoldCase is used.) |
| 515 | // uint16 instead of ParseFlags to control space usage. |
| 516 | uint16 parse_flags_; |
| 517 | |
| 518 | // Reference count. Exists so that SimplifyRegexp can build |
| 519 | // regexp structures that are dags rather than trees to avoid |
| 520 | // exponential blowup in space requirements. |
| 521 | // uint16 to control space usage. |
| 522 | // The standard regexp routines will never generate a |
| 523 | // ref greater than the maximum repeat count (100), |
| 524 | // but even so, Incref and Decref consult an overflow map |
| 525 | // when ref_ reaches kMaxRef. |
| 526 | uint16 ref_; |
| 527 | static const uint16 kMaxRef = 0xffff; |
| 528 | |
| 529 | // Subexpressions. |
| 530 | // uint16 to control space usage. |
| 531 | // Concat and Alternate handle larger numbers of subexpressions |
| 532 | // by building concatenation or alternation trees. |
| 533 | // Other routines should call Concat or Alternate instead of |
| 534 | // filling in sub() by hand. |
| 535 | uint16 nsub_; |
| 536 | static const uint16 kMaxNsub = 0xffff; |
| 537 | union { |
| 538 | Regexp** submany_; // if nsub_ > 1 |
| 539 | Regexp* subone_; // if nsub_ == 1 |
| 540 | }; |
| 541 | |
| 542 | // Extra space for parse and teardown stacks. |
| 543 | Regexp* down_; |
| 544 | |
| 545 | // Arguments to operator. See description of operators above. |
| 546 | union { |
| 547 | struct { // Repeat |
| 548 | int max_; |
| 549 | int min_; |
| 550 | }; |
| 551 | struct { // Capture |
| 552 | int cap_; |
| 553 | string* name_; |
| 554 | }; |
| 555 | struct { // LiteralString |
| 556 | int nrunes_; |
| 557 | Rune* runes_; |
| 558 | }; |
| 559 | struct { // CharClass |
| 560 | // These two could be in separate union members, |
| 561 | // but it wouldn't save any space (there are other two-word structs) |
| 562 | // and keeping them separate avoids confusion during parsing. |
| 563 | CharClass* cc_; |
| 564 | CharClassBuilder* ccb_; |
| 565 | }; |
| 566 | Rune rune_; // Literal |
| 567 | int match_id_; // HaveMatch |
| 568 | void *the_union_[2]; // as big as any other element, for memset |
| 569 | }; |
| 570 | |
| 571 | DISALLOW_EVIL_CONSTRUCTORS(Regexp); |
| 572 | }; |
| 573 | |
| 574 | // Character class set: contains non-overlapping, non-abutting RuneRanges. |
| 575 | typedef set<RuneRange, RuneRangeLess> RuneRangeSet; |
| 576 | |
| 577 | class CharClassBuilder { |
| 578 | public: |
| 579 | CharClassBuilder(); |
| 580 | |
| 581 | typedef RuneRangeSet::iterator iterator; |
| 582 | iterator begin() { return ranges_.begin(); } |
| 583 | iterator end() { return ranges_.end(); } |
| 584 | |
| 585 | int size() { return nrunes_; } |
| 586 | bool empty() { return nrunes_ == 0; } |
| 587 | bool full() { return nrunes_ == Runemax+1; } |
| 588 | |
| 589 | bool Contains(Rune r); |
| 590 | bool FoldsASCII(); |
| 591 | bool AddRange(Rune lo, Rune hi); // returns whether class changed |
| 592 | CharClassBuilder* Copy(); |
| 593 | void AddCharClass(CharClassBuilder* cc); |
| 594 | void Negate(); |
| 595 | void RemoveAbove(Rune r); |
| 596 | CharClass* GetCharClass(); |
| 597 | void AddRangeFlags(Rune lo, Rune hi, Regexp::ParseFlags parse_flags); |
| 598 | |
| 599 | private: |
| 600 | static const uint32 AlphaMask = (1<<26) - 1; |
| 601 | uint32 upper_; // bitmap of A-Z |
| 602 | uint32 lower_; // bitmap of a-z |
| 603 | int nrunes_; |
| 604 | RuneRangeSet ranges_; |
| 605 | DISALLOW_EVIL_CONSTRUCTORS(CharClassBuilder); |
| 606 | }; |
| 607 | |
| 608 | // Tell g++ that bitwise ops on ParseFlags produce ParseFlags. |
| 609 | inline Regexp::ParseFlags operator|(Regexp::ParseFlags a, Regexp::ParseFlags b) |
| 610 | { |
| 611 | return static_cast<Regexp::ParseFlags>(static_cast<int>(a) | static_cast<int>(b)); |
| 612 | } |
| 613 | |
| 614 | inline Regexp::ParseFlags operator^(Regexp::ParseFlags a, Regexp::ParseFlags b) |
| 615 | { |
| 616 | return static_cast<Regexp::ParseFlags>(static_cast<int>(a) ^ static_cast<int>(b)); |
| 617 | } |
| 618 | |
| 619 | inline Regexp::ParseFlags operator&(Regexp::ParseFlags a, Regexp::ParseFlags b) |
| 620 | { |
| 621 | return static_cast<Regexp::ParseFlags>(static_cast<int>(a) & static_cast<int>(b)); |
| 622 | } |
| 623 | |
| 624 | inline Regexp::ParseFlags operator~(Regexp::ParseFlags a) |
| 625 | { |
| 626 | return static_cast<Regexp::ParseFlags>(~static_cast<int>(a)); |
| 627 | } |
| 628 | |
| 629 | |
| 630 | |
| 631 | } // namespace re2 |
| 632 | |
| 633 | #endif // RE2_REGEXP_H__ |