blob: 331c017673c42574fcfff9f73ef181fe3ff0cadf [file] [log] [blame]
Ian Hodson2ee91b42012-05-14 12:29:36 +01001// Copyright 2006 The RE2 Authors. All Rights Reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// --- SPONSORED LINK --------------------------------------------------
6// If you want to use this library for regular expression matching,
7// you should use re2/re2.h, which provides a class RE2 that
8// mimics the PCRE interface provided by PCRE's C++ wrappers.
9// This header describes the low-level interface used to implement RE2
10// and may change in backwards-incompatible ways from time to time.
11// In contrast, RE2's interface will not.
12// ---------------------------------------------------------------------
13
14// Regular expression library: parsing, execution, and manipulation
15// of regular expressions.
16//
17// Any operation that traverses the Regexp structures should be written
18// using Regexp::Walker (see walker-inl.h), not recursively, because deeply nested
19// regular expressions such as x++++++++++++++++++++... might cause recursive
20// traversals to overflow the stack.
21//
22// It is the caller's responsibility to provide appropriate mutual exclusion
23// around manipulation of the regexps. RE2 does this.
24//
25// PARSING
26//
27// Regexp::Parse parses regular expressions encoded in UTF-8.
28// The default syntax is POSIX extended regular expressions,
29// with the following changes:
30//
31// 1. Backreferences (optional in POSIX EREs) are not supported.
32// (Supporting them precludes the use of DFA-based
33// matching engines.)
34//
35// 2. Collating elements and collation classes are not supported.
36// (No one has needed or wanted them.)
37//
38// The exact syntax accepted can be modified by passing flags to
39// Regexp::Parse. In particular, many of the basic Perl additions
40// are available. The flags are documented below (search for LikePerl).
41//
42// If parsed with the flag Regexp::Latin1, both the regular expression
43// and the input to the matching routines are assumed to be encoded in
44// Latin-1, not UTF-8.
45//
46// EXECUTION
47//
48// Once Regexp has parsed a regular expression, it provides methods
49// to search text using that regular expression. These methods are
50// implemented via calling out to other regular expression libraries.
51// (Let's call them the sublibraries.)
52//
53// To call a sublibrary, Regexp does not simply prepare a
54// string version of the regular expression and hand it to the
55// sublibrary. Instead, Regexp prepares, from its own parsed form, the
56// corresponding internal representation used by the sublibrary.
57// This has the drawback of needing to know the internal representation
58// used by the sublibrary, but it has two important benefits:
59//
60// 1. The syntax and meaning of regular expressions is guaranteed
61// to be that used by Regexp's parser, not the syntax expected
62// by the sublibrary. Regexp might accept a restricted or
63// expanded syntax for regular expressions as compared with
64// the sublibrary. As long as Regexp can translate from its
65// internal form into the sublibrary's, clients need not know
66// exactly which sublibrary they are using.
67//
68// 2. The sublibrary parsers are bypassed. For whatever reason,
69// sublibrary regular expression parsers often have security
70// problems. For example, plan9grep's regular expression parser
71// has a buffer overflow in its handling of large character
72// classes, and PCRE's parser has had buffer overflow problems
73// in the past. Security-team requires sandboxing of sublibrary
74// regular expression parsers. Avoiding the sublibrary parsers
75// avoids the sandbox.
76//
77// The execution methods we use now are provided by the compiled form,
78// Prog, described in prog.h
79//
80// MANIPULATION
81//
82// Unlike other regular expression libraries, Regexp makes its parsed
83// form accessible to clients, so that client code can analyze the
84// parsed regular expressions.
85
86#ifndef RE2_REGEXP_H__
87#define RE2_REGEXP_H__
88
89#include "util/util.h"
90#include "re2/stringpiece.h"
91
92namespace re2 {
93
94// Keep in sync with string list kOpcodeNames[] in testing/dump.cc
95enum RegexpOp {
96 // Matches no strings.
97 kRegexpNoMatch = 1,
98
99 // Matches empty string.
100 kRegexpEmptyMatch,
101
102 // Matches rune_.
103 kRegexpLiteral,
104
105 // Matches runes_.
106 kRegexpLiteralString,
107
108 // Matches concatenation of sub_[0..nsub-1].
109 kRegexpConcat,
110 // Matches union of sub_[0..nsub-1].
111 kRegexpAlternate,
112
113 // Matches sub_[0] zero or more times.
114 kRegexpStar,
115 // Matches sub_[0] one or more times.
116 kRegexpPlus,
117 // Matches sub_[0] zero or one times.
118 kRegexpQuest,
119
120 // Matches sub_[0] at least min_ times, at most max_ times.
121 // max_ == -1 means no upper limit.
122 kRegexpRepeat,
123
124 // Parenthesized (capturing) subexpression. Index is cap_.
125 // Optionally, capturing name is name_.
126 kRegexpCapture,
127
128 // Matches any character.
129 kRegexpAnyChar,
130
131 // Matches any byte [sic].
132 kRegexpAnyByte,
133
134 // Matches empty string at beginning of line.
135 kRegexpBeginLine,
136 // Matches empty string at end of line.
137 kRegexpEndLine,
138
139 // Matches word boundary "\b".
140 kRegexpWordBoundary,
141 // Matches not-a-word boundary "\B".
142 kRegexpNoWordBoundary,
143
144 // Matches empty string at beginning of text.
145 kRegexpBeginText,
146 // Matches empty string at end of text.
147 kRegexpEndText,
148
149 // Matches character class given by cc_.
150 kRegexpCharClass,
151
152 // Forces match of entire expression right now,
153 // with match ID match_id_ (used by RE2::Set).
154 kRegexpHaveMatch,
155
156 kMaxRegexpOp = kRegexpHaveMatch,
157};
158
159// Keep in sync with string list in regexp.cc
160enum RegexpStatusCode {
161 // No error
162 kRegexpSuccess = 0,
163
164 // Unexpected error
165 kRegexpInternalError,
166
167 // Parse errors
168 kRegexpBadEscape, // bad escape sequence
169 kRegexpBadCharClass, // bad character class
170 kRegexpBadCharRange, // bad character class range
171 kRegexpMissingBracket, // missing closing ]
172 kRegexpMissingParen, // missing closing )
173 kRegexpTrailingBackslash, // at end of regexp
174 kRegexpRepeatArgument, // repeat argument missing, e.g. "*"
175 kRegexpRepeatSize, // bad repetition argument
176 kRegexpRepeatOp, // bad repetition operator
177 kRegexpBadPerlOp, // bad perl operator
178 kRegexpBadUTF8, // invalid UTF-8 in regexp
179 kRegexpBadNamedCapture, // bad named capture
180};
181
182// Error status for certain operations.
183class RegexpStatus {
184 public:
185 RegexpStatus() : code_(kRegexpSuccess), tmp_(NULL) {}
186 ~RegexpStatus() { delete tmp_; }
187
188 void set_code(enum RegexpStatusCode code) { code_ = code; }
189 void set_error_arg(const StringPiece& error_arg) { error_arg_ = error_arg; }
190 void set_tmp(string* tmp) { delete tmp_; tmp_ = tmp; }
191 enum RegexpStatusCode code() const { return code_; }
192 const StringPiece& error_arg() const { return error_arg_; }
193 bool ok() const { return code() == kRegexpSuccess; }
194
195 // Copies state from status.
196 void Copy(const RegexpStatus& status);
197
198 // Returns text equivalent of code, e.g.:
199 // "Bad character class"
Alexander Gutkin0d4c5232013-02-28 13:47:27 +0000200 static string CodeText(enum RegexpStatusCode code);
Ian Hodson2ee91b42012-05-14 12:29:36 +0100201
202 // Returns text describing error, e.g.:
203 // "Bad character class: [z-a]"
204 string Text() const;
205
206 private:
207 enum RegexpStatusCode code_; // Kind of error
208 StringPiece error_arg_; // Piece of regexp containing syntax error.
209 string* tmp_; // Temporary storage, possibly where error_arg_ is.
210
211 DISALLOW_EVIL_CONSTRUCTORS(RegexpStatus);
212};
213
214// Walker to implement Simplify.
215class SimplifyWalker;
216
217// Compiled form; see prog.h
218class Prog;
219
220struct RuneRange {
221 RuneRange() : lo(0), hi(0) { }
222 RuneRange(int l, int h) : lo(l), hi(h) { }
223 Rune lo;
224 Rune hi;
225};
226
227// Less-than on RuneRanges treats a == b if they overlap at all.
228// This lets us look in a set to find the range covering a particular Rune.
229struct RuneRangeLess {
230 bool operator()(const RuneRange& a, const RuneRange& b) const {
231 return a.hi < b.lo;
232 }
233};
234
235class CharClassBuilder;
236
237class CharClass {
238 public:
239 void Delete();
240
241 typedef RuneRange* iterator;
242 iterator begin() { return ranges_; }
243 iterator end() { return ranges_ + nranges_; }
244
245 int size() { return nrunes_; }
246 bool empty() { return nrunes_ == 0; }
247 bool full() { return nrunes_ == Runemax+1; }
248 bool FoldsASCII() { return folds_ascii_; }
249
250 bool Contains(Rune r);
251 CharClass* Negate();
252
253 private:
254 CharClass(); // not implemented
255 ~CharClass(); // not implemented
256 static CharClass* New(int maxranges);
257
258 friend class CharClassBuilder;
259
260 bool folds_ascii_;
261 int nrunes_;
262 RuneRange *ranges_;
263 int nranges_;
264 DISALLOW_EVIL_CONSTRUCTORS(CharClass);
265};
266
267class Regexp {
268 public:
269
270 // Flags for parsing. Can be ORed together.
271 enum ParseFlags {
272 NoParseFlags = 0,
273 FoldCase = 1<<0, // Fold case during matching (case-insensitive).
274 Literal = 1<<1, // Treat s as literal string instead of a regexp.
275 ClassNL = 1<<2, // Allow char classes like [^a-z] and \D and \s
276 // and [[:space:]] to match newline.
277 DotNL = 1<<3, // Allow . to match newline.
278 MatchNL = ClassNL | DotNL,
279 OneLine = 1<<4, // Treat ^ and $ as only matching at beginning and
280 // end of text, not around embedded newlines.
281 // (Perl's default)
282 Latin1 = 1<<5, // Regexp and text are in Latin1, not UTF-8.
283 NonGreedy = 1<<6, // Repetition operators are non-greedy by default.
284 PerlClasses = 1<<7, // Allow Perl character classes like \d.
285 PerlB = 1<<8, // Allow Perl's \b and \B.
286 PerlX = 1<<9, // Perl extensions:
287 // non-capturing parens - (?: )
288 // non-greedy operators - *? +? ?? {}?
289 // flag edits - (?i) (?-i) (?i: )
290 // i - FoldCase
291 // m - !OneLine
292 // s - DotNL
293 // U - NonGreedy
294 // line ends: \A \z
295 // \Q and \E to disable/enable metacharacters
296 // (?P<name>expr) for named captures
297 // \C to match any single byte
298 UnicodeGroups = 1<<10, // Allow \p{Han} for Unicode Han group
299 // and \P{Han} for its negation.
300 NeverNL = 1<<11, // Never match NL, even if the regexp mentions
301 // it explicitly.
Alexander Gutkin0d4c5232013-02-28 13:47:27 +0000302 NeverCapture = 1<<12, // Parse all parens as non-capturing.
Ian Hodson2ee91b42012-05-14 12:29:36 +0100303
304 // As close to Perl as we can get.
305 LikePerl = ClassNL | OneLine | PerlClasses | PerlB | PerlX |
306 UnicodeGroups,
307
308 // Internal use only.
309 WasDollar = 1<<15, // on kRegexpEndText: was $ in regexp text
310 };
311
312 // Get. No set, Regexps are logically immutable once created.
313 RegexpOp op() { return static_cast<RegexpOp>(op_); }
314 int nsub() { return nsub_; }
315 bool simple() { return simple_; }
316 enum ParseFlags parse_flags() { return static_cast<ParseFlags>(parse_flags_); }
317 int Ref(); // For testing.
318
319 Regexp** sub() {
320 if(nsub_ <= 1)
321 return &subone_;
322 else
323 return submany_;
324 }
325
326 int min() { DCHECK_EQ(op_, kRegexpRepeat); return min_; }
327 int max() { DCHECK_EQ(op_, kRegexpRepeat); return max_; }
328 Rune rune() { DCHECK_EQ(op_, kRegexpLiteral); return rune_; }
329 CharClass* cc() { DCHECK_EQ(op_, kRegexpCharClass); return cc_; }
330 int cap() { DCHECK_EQ(op_, kRegexpCapture); return cap_; }
331 const string* name() { DCHECK_EQ(op_, kRegexpCapture); return name_; }
332 Rune* runes() { DCHECK_EQ(op_, kRegexpLiteralString); return runes_; }
333 int nrunes() { DCHECK_EQ(op_, kRegexpLiteralString); return nrunes_; }
334 int match_id() { DCHECK_EQ(op_, kRegexpHaveMatch); return match_id_; }
335
336 // Increments reference count, returns object as convenience.
337 Regexp* Incref();
338
339 // Decrements reference count and deletes this object if count reaches 0.
340 void Decref();
341
342 // Parses string s to produce regular expression, returned.
343 // Caller must release return value with re->Decref().
344 // On failure, sets *status (if status != NULL) and returns NULL.
345 static Regexp* Parse(const StringPiece& s, ParseFlags flags,
346 RegexpStatus* status);
347
348 // Returns a _new_ simplified version of the current regexp.
349 // Does not edit the current regexp.
350 // Caller must release return value with re->Decref().
351 // Simplified means that counted repetition has been rewritten
352 // into simpler terms and all Perl/POSIX features have been
353 // removed. The result will capture exactly the same
354 // subexpressions the original did, unless formatted with ToString.
355 Regexp* Simplify();
356 friend class SimplifyWalker;
357
358 // Parses the regexp src and then simplifies it and sets *dst to the
359 // string representation of the simplified form. Returns true on success.
360 // Returns false and sets *status (if status != NULL) on parse error.
361 static bool SimplifyRegexp(const StringPiece& src, ParseFlags flags,
362 string* dst,
363 RegexpStatus* status);
364
365 // Returns the number of capturing groups in the regexp.
366 int NumCaptures();
367 friend class NumCapturesWalker;
368
369 // Returns a map from names to capturing group indices,
370 // or NULL if the regexp contains no named capture groups.
371 // The caller is responsible for deleting the map.
372 map<string, int>* NamedCaptures();
373
374 // Returns a map from capturing group indices to capturing group
375 // names or NULL if the regexp contains no named capture groups. The
376 // caller is responsible for deleting the map.
377 map<int, string>* CaptureNames();
378
379 // Returns a string representation of the current regexp,
380 // using as few parentheses as possible.
381 string ToString();
382
383 // Convenience functions. They consume the passed reference,
384 // so in many cases you should use, e.g., Plus(re->Incref(), flags).
385 // They do not consume allocated arrays like subs or runes.
386 static Regexp* Plus(Regexp* sub, ParseFlags flags);
387 static Regexp* Star(Regexp* sub, ParseFlags flags);
388 static Regexp* Quest(Regexp* sub, ParseFlags flags);
389 static Regexp* Concat(Regexp** subs, int nsubs, ParseFlags flags);
390 static Regexp* Alternate(Regexp** subs, int nsubs, ParseFlags flags);
391 static Regexp* Capture(Regexp* sub, ParseFlags flags, int cap);
392 static Regexp* Repeat(Regexp* sub, ParseFlags flags, int min, int max);
393 static Regexp* NewLiteral(Rune rune, ParseFlags flags);
394 static Regexp* NewCharClass(CharClass* cc, ParseFlags flags);
395 static Regexp* LiteralString(Rune* runes, int nrunes, ParseFlags flags);
396 static Regexp* HaveMatch(int match_id, ParseFlags flags);
397
398 // Like Alternate but does not factor out common prefixes.
399 static Regexp* AlternateNoFactor(Regexp** subs, int nsubs, ParseFlags flags);
400
401 // Debugging function. Returns string format for regexp
402 // that makes structure clear. Does NOT use regexp syntax.
403 string Dump();
404
405 // Helper traversal class, defined fully in walker-inl.h.
406 template<typename T> class Walker;
407
408 // Compile to Prog. See prog.h
409 // Reverse prog expects to be run over text backward.
410 // Construction and execution of prog will
411 // stay within approximately max_mem bytes of memory.
412 // If max_mem <= 0, a reasonable default is used.
413 Prog* CompileToProg(int64 max_mem);
414 Prog* CompileToReverseProg(int64 max_mem);
415
416 // Whether to expect this library to find exactly the same answer as PCRE
417 // when running this regexp. Most regexps do mimic PCRE exactly, but a few
418 // obscure cases behave differently. Technically this is more a property
419 // of the Prog than the Regexp, but the computation is much easier to do
420 // on the Regexp. See mimics_pcre.cc for the exact conditions.
421 bool MimicsPCRE();
422
423 // Benchmarking function.
424 void NullWalk();
425
426 // Whether every match of this regexp must be anchored and
427 // begin with a non-empty fixed string (perhaps after ASCII
428 // case-folding). If so, returns the prefix and the sub-regexp that
429 // follows it.
430 bool RequiredPrefix(string* prefix, bool *foldcase, Regexp** suffix);
431
432 private:
433 // Constructor allocates vectors as appropriate for operator.
434 explicit Regexp(RegexpOp op, ParseFlags parse_flags);
435
436 // Use Decref() instead of delete to release Regexps.
437 // This is private to catch deletes at compile time.
438 ~Regexp();
439 void Destroy();
440 bool QuickDestroy();
441
442 // Helpers for Parse. Listed here so they can edit Regexps.
443 class ParseState;
444 friend class ParseState;
445 friend bool ParseCharClass(StringPiece* s, Regexp** out_re,
446 RegexpStatus* status);
447
448 // Helper for testing [sic].
449 friend bool RegexpEqualTestingOnly(Regexp*, Regexp*);
450
451 // Computes whether Regexp is already simple.
452 bool ComputeSimple();
453
454 // Constructor that generates a concatenation or alternation,
455 // enforcing the limit on the number of subexpressions for
456 // a particular Regexp.
457 static Regexp* ConcatOrAlternate(RegexpOp op, Regexp** subs, int nsubs,
458 ParseFlags flags, bool can_factor);
459
460 // Returns the leading string that re starts with.
461 // The returned Rune* points into a piece of re,
462 // so it must not be used after the caller calls re->Decref().
463 static Rune* LeadingString(Regexp* re, int* nrune, ParseFlags* flags);
464
465 // Removes the first n leading runes from the beginning of re.
466 // Edits re in place.
467 static void RemoveLeadingString(Regexp* re, int n);
468
469 // Returns the leading regexp in re's top-level concatenation.
470 // The returned Regexp* points at re or a sub-expression of re,
471 // so it must not be used after the caller calls re->Decref().
472 static Regexp* LeadingRegexp(Regexp* re);
473
474 // Removes LeadingRegexp(re) from re and returns the remainder.
475 // Might edit re in place.
476 static Regexp* RemoveLeadingRegexp(Regexp* re);
477
478 // Simplifies an alternation of literal strings by factoring out
479 // common prefixes.
480 static int FactorAlternation(Regexp** sub, int nsub, ParseFlags flags);
481 static int FactorAlternationRecursive(Regexp** sub, int nsub,
482 ParseFlags flags, int maxdepth);
483
484 // Is a == b? Only efficient on regexps that have not been through
485 // Simplify yet - the expansion of a kRegexpRepeat will make this
486 // take a long time. Do not call on such regexps, hence private.
487 static bool Equal(Regexp* a, Regexp* b);
488
489 // Allocate space for n sub-regexps.
490 void AllocSub(int n) {
491 if (n < 0 || static_cast<uint16>(n) != n)
492 LOG(FATAL) << "Cannot AllocSub " << n;
493 if (n > 1)
494 submany_ = new Regexp*[n];
495 nsub_ = n;
496 }
497
498 // Add Rune to LiteralString
499 void AddRuneToString(Rune r);
500
501 // Swaps this with that, in place.
502 void Swap(Regexp *that);
503
504 // Operator. See description of operators above.
505 // uint8 instead of RegexpOp to control space usage.
506 uint8 op_;
507
508 // Is this regexp structure already simple
509 // (has it been returned by Simplify)?
510 // uint8 instead of bool to control space usage.
511 uint8 simple_;
512
513 // Flags saved from parsing and used during execution.
514 // (Only FoldCase is used.)
515 // uint16 instead of ParseFlags to control space usage.
516 uint16 parse_flags_;
517
518 // Reference count. Exists so that SimplifyRegexp can build
519 // regexp structures that are dags rather than trees to avoid
520 // exponential blowup in space requirements.
521 // uint16 to control space usage.
522 // The standard regexp routines will never generate a
523 // ref greater than the maximum repeat count (100),
524 // but even so, Incref and Decref consult an overflow map
525 // when ref_ reaches kMaxRef.
526 uint16 ref_;
527 static const uint16 kMaxRef = 0xffff;
528
529 // Subexpressions.
530 // uint16 to control space usage.
531 // Concat and Alternate handle larger numbers of subexpressions
532 // by building concatenation or alternation trees.
533 // Other routines should call Concat or Alternate instead of
534 // filling in sub() by hand.
535 uint16 nsub_;
536 static const uint16 kMaxNsub = 0xffff;
537 union {
538 Regexp** submany_; // if nsub_ > 1
539 Regexp* subone_; // if nsub_ == 1
540 };
541
542 // Extra space for parse and teardown stacks.
543 Regexp* down_;
544
545 // Arguments to operator. See description of operators above.
546 union {
547 struct { // Repeat
548 int max_;
549 int min_;
550 };
551 struct { // Capture
552 int cap_;
553 string* name_;
554 };
555 struct { // LiteralString
556 int nrunes_;
557 Rune* runes_;
558 };
559 struct { // CharClass
560 // These two could be in separate union members,
561 // but it wouldn't save any space (there are other two-word structs)
562 // and keeping them separate avoids confusion during parsing.
563 CharClass* cc_;
564 CharClassBuilder* ccb_;
565 };
566 Rune rune_; // Literal
567 int match_id_; // HaveMatch
568 void *the_union_[2]; // as big as any other element, for memset
569 };
570
571 DISALLOW_EVIL_CONSTRUCTORS(Regexp);
572};
573
574// Character class set: contains non-overlapping, non-abutting RuneRanges.
575typedef set<RuneRange, RuneRangeLess> RuneRangeSet;
576
577class CharClassBuilder {
578 public:
579 CharClassBuilder();
580
581 typedef RuneRangeSet::iterator iterator;
582 iterator begin() { return ranges_.begin(); }
583 iterator end() { return ranges_.end(); }
584
585 int size() { return nrunes_; }
586 bool empty() { return nrunes_ == 0; }
587 bool full() { return nrunes_ == Runemax+1; }
588
589 bool Contains(Rune r);
590 bool FoldsASCII();
591 bool AddRange(Rune lo, Rune hi); // returns whether class changed
592 CharClassBuilder* Copy();
593 void AddCharClass(CharClassBuilder* cc);
594 void Negate();
595 void RemoveAbove(Rune r);
596 CharClass* GetCharClass();
597 void AddRangeFlags(Rune lo, Rune hi, Regexp::ParseFlags parse_flags);
598
599 private:
600 static const uint32 AlphaMask = (1<<26) - 1;
601 uint32 upper_; // bitmap of A-Z
602 uint32 lower_; // bitmap of a-z
603 int nrunes_;
604 RuneRangeSet ranges_;
605 DISALLOW_EVIL_CONSTRUCTORS(CharClassBuilder);
606};
607
608// Tell g++ that bitwise ops on ParseFlags produce ParseFlags.
609inline Regexp::ParseFlags operator|(Regexp::ParseFlags a, Regexp::ParseFlags b)
610{
611 return static_cast<Regexp::ParseFlags>(static_cast<int>(a) | static_cast<int>(b));
612}
613
614inline Regexp::ParseFlags operator^(Regexp::ParseFlags a, Regexp::ParseFlags b)
615{
616 return static_cast<Regexp::ParseFlags>(static_cast<int>(a) ^ static_cast<int>(b));
617}
618
619inline Regexp::ParseFlags operator&(Regexp::ParseFlags a, Regexp::ParseFlags b)
620{
621 return static_cast<Regexp::ParseFlags>(static_cast<int>(a) & static_cast<int>(b));
622}
623
624inline Regexp::ParseFlags operator~(Regexp::ParseFlags a)
625{
626 return static_cast<Regexp::ParseFlags>(~static_cast<int>(a));
627}
628
629
630
631} // namespace re2
632
633#endif // RE2_REGEXP_H__