Blame - re2/re2.h - fp2-dev/platform/external/regex-re2

blob: 272028b94feecc0429d86417f07375cbaf7ffbac [file] [log] [blame]

Ian Hodson	2ee91b4	2012-05-14 12:29:36 +0100	[diff] [blame]	1	// Copyright 2003-2009 The RE2 Authors. All Rights Reserved.
				2	// Use of this source code is governed by a BSD-style
				3	// license that can be found in the LICENSE file.
				4
				5	#ifndef RE2_RE2_H
				6	#define RE2_RE2_H
				7
				8	// C++ interface to the re2 regular-expression library.
				9	// RE2 supports Perl-style regular expressions (with extensions like
				10	// \d, \w, \s, ...).
				11	//
				12	// -----------------------------------------------------------------------
				13	// REGEXP SYNTAX:
				14	//
				15	// This module uses the re2 library and hence supports
				16	// its syntax for regular expressions, which is similar to Perl's with
				17	// some of the more complicated things thrown away. In particular,
				18	// backreferences and generalized assertions are not available, nor is \Z.
				19	//
				20	// See http://code.google.com/p/re2/wiki/Syntax for the syntax
				21	// supported by RE2, and a comparison with PCRE and PERL regexps.
				22	//
				23	// For those not familiar with Perl's regular expressions,
				24	// here are some examples of the most commonly used extensions:
				25	//
				26	// "hello (\\w+) world" -- \w matches a "word" character
				27	// "version (\\d+)" -- \d matches a digit
				28	// "hello\\s+world" -- \s matches any whitespace character
				29	// "\\b(\\w+)\\b" -- \b matches the empty string at a word boundary
				30	// "(?i)hello" -- (?i) turns on case-insensitive matching
				31	// "/\\*(.*?)\\*/" -- .*? matches . minimum no. of times possible
				32	//
				33	// -----------------------------------------------------------------------
				34	// MATCHING INTERFACE:
				35	//
				36	// The "FullMatch" operation checks that supplied text matches a
				37	// supplied pattern exactly.
				38	//
				39	// Example: successful match
				40	// CHECK(RE2::FullMatch("hello", "h.*o"));
				41	//
				42	// Example: unsuccessful match (requires full match):
				43	// CHECK(!RE2::FullMatch("hello", "e"));
				44	//
				45	// -----------------------------------------------------------------------
				46	// UTF-8 AND THE MATCHING INTERFACE:
				47	//
				48	// By default, the pattern and input text are interpreted as UTF-8.
				49	// The RE2::Latin1 option causes them to be interpreted as Latin-1.
				50	//
				51	// Example:
				52	// CHECK(RE2::FullMatch(utf8_string, RE2(utf8_pattern)));
				53	// CHECK(RE2::FullMatch(latin1_string, RE2(latin1_pattern, RE2::Latin1)));
				54	//
				55	// -----------------------------------------------------------------------
				56	// MATCHING WITH SUB-STRING EXTRACTION:
				57	//
				58	// You can supply extra pointer arguments to extract matched subpieces.
				59	//
				60	// Example: extracts "ruby" into "s" and 1234 into "i"
				61	// int i;
				62	// string s;
				63	// CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", &s, &i));
				64	//
				65	// Example: fails because string cannot be stored in integer
				66	// CHECK(!RE2::FullMatch("ruby", "(.*)", &i));
				67	//
				68	// Example: fails because there aren't enough sub-patterns:
				69	// CHECK(!RE2::FullMatch("ruby:1234", "\\w+:\\d+", &s));
				70	//
				71	// Example: does not try to extract any extra sub-patterns
				72	// CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", &s));
				73	//
				74	// Example: does not try to extract into NULL
				75	// CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", NULL, &i));
				76	//
				77	// Example: integer overflow causes failure
				78	// CHECK(!RE2::FullMatch("ruby:1234567891234", "\\w+:(\\d+)", &i));
				79	//
				80	// NOTE(rsc): Asking for substrings slows successful matches quite a bit.
				81	// This may get a little faster in the future, but right now is slower
				82	// than PCRE. On the other hand, failed matches run very fast (faster
				83	// than PCRE), as do matches without substring extraction.
				84	//
				85	// -----------------------------------------------------------------------
				86	// PARTIAL MATCHES
				87	//
				88	// You can use the "PartialMatch" operation when you want the pattern
				89	// to match any substring of the text.
				90	//
				91	// Example: simple search for a string:
				92	// CHECK(RE2::PartialMatch("hello", "ell"));
				93	//
				94	// Example: find first number in a string
				95	// int number;
				96	// CHECK(RE2::PartialMatch("x*100 + 20", "(\\d+)", &number));
				97	// CHECK_EQ(number, 100);
				98	//
				99	// -----------------------------------------------------------------------
				100	// PRE-COMPILED REGULAR EXPRESSIONS
				101	//
				102	// RE2 makes it easy to use any string as a regular expression, without
				103	// requiring a separate compilation step.
				104	//
				105	// If speed is of the essence, you can create a pre-compiled "RE2"
				106	// object from the pattern and use it multiple times. If you do so,
				107	// you can typically parse text faster than with sscanf.
				108	//
				109	// Example: precompile pattern for faster matching:
				110	// RE2 pattern("h.*o");
				111	// while (ReadLine(&str)) {
				112	// if (RE2::FullMatch(str, pattern)) ...;
				113	// }
				114	//
				115	// -----------------------------------------------------------------------
				116	// SCANNING TEXT INCREMENTALLY
				117	//
				118	// The "Consume" operation may be useful if you want to repeatedly
				119	// match regular expressions at the front of a string and skip over
				120	// them as they match. This requires use of the "StringPiece" type,
				121	// which represents a sub-range of a real string.
				122	//
				123	// Example: read lines of the form "var = value" from a string.
				124	// string contents = ...; // Fill string somehow
				125	// StringPiece input(contents); // Wrap a StringPiece around it
				126	//
				127	// string var;
				128	// int value;
				129	// while (RE2::Consume(&input, "(\\w+) = (\\d+)\n", &var, &value)) {
				130	// ...;
				131	// }
				132	//
				133	// Each successful call to "Consume" will set "var/value", and also
				134	// advance "input" so it points past the matched text. Note that if the
				135	// regular expression matches an empty string, input will advance
				136	// by 0 bytes. If the regular expression being used might match
				137	// an empty string, the loop body must check for this case and either
				138	// advance the string or break out of the loop.
				139	//
				140	// The "FindAndConsume" operation is similar to "Consume" but does not
				141	// anchor your match at the beginning of the string. For example, you
				142	// could extract all words from a string by repeatedly calling
				143	// RE2::FindAndConsume(&input, "(\\w+)", &word)
				144	//
				145	// -----------------------------------------------------------------------
				146	// USING VARIABLE NUMBER OF ARGUMENTS
				147	//
				148	// The above operations require you to know the number of arguments
				149	// when you write the code. This is not always possible or easy (for
				150	// example, the regular expression may be calculated at run time).
				151	// You can use the "N" version of the operations when the number of
				152	// match arguments is determined at run time.
				153	//
				154	// Example:
				155	// const RE2::Arg* args[10];
				156	// int n;
				157	// // ... populate args with pointers to RE2::Arg values ...
				158	// // ... set n to the number of RE2::Arg objects ...
				159	// bool match = RE2::FullMatchN(input, pattern, args, n);
				160	//
				161	// The last statement is equivalent to
				162	//
				163	// bool match = RE2::FullMatch(input, pattern,
				164	// *args[0], *args[1], ..., *args[n - 1]);
				165	//
				166	// -----------------------------------------------------------------------
				167	// PARSING HEX/OCTAL/C-RADIX NUMBERS
				168	//
				169	// By default, if you pass a pointer to a numeric value, the
				170	// corresponding text is interpreted as a base-10 number. You can
				171	// instead wrap the pointer with a call to one of the operators Hex(),
				172	// Octal(), or CRadix() to interpret the text in another base. The
				173	// CRadix operator interprets C-style "0" (base-8) and "0x" (base-16)
				174	// prefixes, but defaults to base-10.
				175	//
				176	// Example:
				177	// int a, b, c, d;
				178	// CHECK(RE2::FullMatch("100 40 0100 0x40", "(.*) (.*) (.*) (.*)",
				179	// RE2::Octal(&a), RE2::Hex(&b), RE2::CRadix(&c), RE2::CRadix(&d)));
				180	// will leave 64 in a, b, c, and d.
				181
				182
				183	#include <stdint.h>
				184	#include <map>
				185	#include <string>
				186	#include "re2/stringpiece.h"
				187	#include "re2/variadic_function.h"
				188
				189	namespace re2 {
Alexander Gutkin	0d4c523	2013-02-28 13:47:27 +0000	[diff] [blame]	190
Ian Hodson	2ee91b4	2012-05-14 12:29:36 +0100	[diff] [blame]	191	using std::string;
				192	using std::map;
				193	class Mutex;
				194	class Prog;
				195	class Regexp;
				196
Alexander Gutkin	0d4c523	2013-02-28 13:47:27 +0000	[diff] [blame]	197	// The following enum should be used only as a constructor argument to indicate
				198	// that the variable has static storage class, and that the constructor should
				199	// do nothing to its state. It indicates to the reader that it is legal to
				200	// declare a static instance of the class, provided the constructor is given
				201	// the LINKER_INITIALIZED argument. Normally, it is unsafe to declare a
				202	// static variable that has a constructor or a destructor because invocation
				203	// order is undefined. However, IF the type can be initialized by filling with
				204	// zeroes (which the loader does for static variables), AND the type's
				205	// destructor does nothing to the storage, then a constructor for static
				206	// initialization can be declared as
				207	// explicit MyClass(LinkerInitialized x) {}
				208	// and invoked as
				209	// static MyClass my_variable_name(LINKER_INITIALIZED);
				210	enum LinkerInitialized { LINKER_INITIALIZED };
				211
Ian Hodson	2ee91b4	2012-05-14 12:29:36 +0100	[diff] [blame]	212	// Interface for regular expression matching. Also corresponds to a
				213	// pre-compiled regular expression. An "RE2" object is safe for
				214	// concurrent use by multiple threads.
				215	class RE2 {
				216	public:
				217	// We convert user-passed pointers into special Arg objects
				218	class Arg;
				219	class Options;
				220
				221	// Defined in set.h.
				222	class Set;
				223
				224	enum ErrorCode {
				225	NoError = 0,
				226
				227	// Unexpected error
				228	ErrorInternal,
				229
				230	// Parse errors
				231	ErrorBadEscape, // bad escape sequence
				232	ErrorBadCharClass, // bad character class
				233	ErrorBadCharRange, // bad character class range
				234	ErrorMissingBracket, // missing closing ]
				235	ErrorMissingParen, // missing closing )
				236	ErrorTrailingBackslash, // trailing \ at end of regexp
				237	ErrorRepeatArgument, // repeat argument missing, e.g. "*"
				238	ErrorRepeatSize, // bad repetition argument
				239	ErrorRepeatOp, // bad repetition operator
				240	ErrorBadPerlOp, // bad perl operator
				241	ErrorBadUTF8, // invalid UTF-8 in regexp
				242	ErrorBadNamedCapture, // bad named capture group
				243	ErrorPatternTooLarge, // pattern too large (compile failed)
				244	};
				245
				246	// Predefined common options.
				247	// If you need more complicated things, instantiate
Alexander Gutkin	0d4c523	2013-02-28 13:47:27 +0000	[diff] [blame]	248	// an Options object, possibly passing one of these to
				249	// the Options constructor, change the settings, and pass that
				250	// Options object to the RE2 constructor.
				251	enum CannedOptions {
				252	DefaultOptions = 0,
				253	Latin1, // treat input as Latin-1 (default UTF-8)
				254	POSIX, // POSIX syntax, leftmost-longest match
				255	Quiet // do not log about regexp parse errors
				256	};
Ian Hodson	2ee91b4	2012-05-14 12:29:36 +0100	[diff] [blame]	257
				258	// Need to have the const char* and const string& forms for implicit
				259	// conversions when passing string literals to FullMatch and PartialMatch.
				260	// Otherwise the StringPiece form would be sufficient.
				261	#ifndef SWIG
				262	RE2(const char* pattern);
				263	RE2(const string& pattern);
				264	#endif
				265	RE2(const StringPiece& pattern);
				266	RE2(const StringPiece& pattern, const Options& option);
				267	~RE2();
				268
				269	// Returns whether RE2 was created properly.
				270	bool ok() const { return error_code() == NoError; }
				271
				272	// The string specification for this RE2. E.g.
				273	// RE2 re("ab*c?d+");
				274	// re.pattern(); // "ab*c?d+"
				275	const string& pattern() const { return pattern_; }
				276
				277	// If RE2 could not be created properly, returns an error string.
				278	// Else returns the empty string.
				279	const string& error() const { return *error_; }
				280
				281	// If RE2 could not be created properly, returns an error code.
				282	// Else returns RE2::NoError (== 0).
				283	ErrorCode error_code() const { return error_code_; }
				284
				285	// If RE2 could not be created properly, returns the offending
				286	// portion of the regexp.
				287	const string& error_arg() const { return error_arg_; }
				288
				289	// Returns the program size, a very approximate measure of a regexp's "cost".
				290	// Larger numbers are more expensive than smaller numbers.
				291	int ProgramSize() const;
				292
				293	// Returns the underlying Regexp; not for general use.
				294	// Returns entire_regexp_ so that callers don't need
				295	// to know about prefix_ and prefix_foldcase_.
				296	re2::Regexp* Regexp() const { return entire_regexp_; }
				297
				298	/*** The useful part: the matching interface ***/
				299
				300	// Matches "text" against "pattern". If pointer arguments are
				301	// supplied, copies matched sub-patterns into them.
				302	//
				303	// You can pass in a "const char*" or a "string" for "text".
				304	// You can pass in a "const char*" or a "string" or a "RE2" for "pattern".
				305	//
				306	// The provided pointer arguments can be pointers to any scalar numeric
				307	// type, or one of:
				308	// string (matched piece is copied to string)
				309	// StringPiece (StringPiece is mutated to point to matched piece)
				310	// T (where "bool T::ParseFrom(const char*, int)" exists)
				311	// (void*)NULL (the corresponding matched sub-pattern is not copied)
				312	//
				313	// Returns true iff all of the following conditions are satisfied:
				314	// a. "text" matches "pattern" exactly
				315	// b. The number of matched sub-patterns is >= number of supplied pointers
				316	// c. The "i"th argument has a suitable type for holding the
				317	// string captured as the "i"th sub-pattern. If you pass in
				318	// NULL for the "i"th argument, or pass fewer arguments than
				319	// number of sub-patterns, "i"th captured sub-pattern is
				320	// ignored.
				321	//
				322	// CAVEAT: An optional sub-pattern that does not exist in the
				323	// matched string is assigned the empty string. Therefore, the
				324	// following will return false (because the empty string is not a
				325	// valid number):
				326	// int number;
				327	// RE2::FullMatch("abc", "[a-z]+(\\d+)?", &number);
				328	static bool FullMatchN(const StringPiece& text, const RE2& re,
				329	const Arg* const args[], int argc);
				330	static const VariadicFunction2<
				331	bool, const StringPiece&, const RE2&, Arg, RE2::FullMatchN> FullMatch;
				332
				333	// Exactly like FullMatch(), except that "pattern" is allowed to match
				334	// a substring of "text".
				335	static bool PartialMatchN(const StringPiece& text, const RE2& re, // 3..16 args
				336	const Arg* const args[], int argc);
				337	static const VariadicFunction2<
				338	bool, const StringPiece&, const RE2&, Arg, RE2::PartialMatchN> PartialMatch;
				339
				340	// Like FullMatch() and PartialMatch(), except that pattern has to
				341	// match a prefix of "text", and "input" is advanced past the matched
				342	// text. Note: "input" is modified iff this routine returns true.
				343	static bool ConsumeN(StringPiece* input, const RE2& pattern, // 3..16 args
				344	const Arg* const args[], int argc);
				345	static const VariadicFunction2<
				346	bool, StringPiece*, const RE2&, Arg, RE2::ConsumeN> Consume;
				347
				348	// Like Consume(..), but does not anchor the match at the beginning of the
				349	// string. That is, "pattern" need not start its match at the beginning of
				350	// "input". For example, "FindAndConsume(s, "(\\w+)", &word)" finds the next
				351	// word in "s" and stores it in "word".
				352	static bool FindAndConsumeN(StringPiece* input, const RE2& pattern,
				353	const Arg* const args[], int argc);
				354	static const VariadicFunction2<
				355	bool, StringPiece*, const RE2&, Arg, RE2::FindAndConsumeN> FindAndConsume;
				356
				357	// Replace the first match of "pattern" in "str" with "rewrite".
				358	// Within "rewrite", backslash-escaped digits (\1 to \9) can be
				359	// used to insert text matching corresponding parenthesized group
				360	// from the pattern. \0 in "rewrite" refers to the entire matching
				361	// text. E.g.,
				362	//
				363	// string s = "yabba dabba doo";
				364	// CHECK(RE2::Replace(&s, "b+", "d"));
				365	//
				366	// will leave "s" containing "yada dabba doo"
				367	//
				368	// Returns true if the pattern matches and a replacement occurs,
				369	// false otherwise.
				370	static bool Replace(string *str,
				371	const RE2& pattern,
				372	const StringPiece& rewrite);
				373
				374	// Like Replace(), except replaces successive non-overlapping occurrences
				375	// of the pattern in the string with the rewrite. E.g.
				376	//
				377	// string s = "yabba dabba doo";
				378	// CHECK(RE2::GlobalReplace(&s, "b+", "d"));
				379	//
				380	// will leave "s" containing "yada dada doo"
				381	// Replacements are not subject to re-matching.
				382	//
				383	// Because GlobalReplace only replaces non-overlapping matches,
				384	// replacing "ana" within "banana" makes only one replacement, not two.
				385	//
				386	// Returns the number of replacements made.
				387	static int GlobalReplace(string *str,
				388	const RE2& pattern,
				389	const StringPiece& rewrite);
				390
				391	// Like Replace, except that if the pattern matches, "rewrite"
				392	// is copied into "out" with substitutions. The non-matching
				393	// portions of "text" are ignored.
				394	//
				395	// Returns true iff a match occurred and the extraction happened
				396	// successfully; if no match occurs, the string is left unaffected.
				397	static bool Extract(const StringPiece &text,
				398	const RE2& pattern,
				399	const StringPiece &rewrite,
				400	string *out);
				401
				402	// Escapes all potentially meaningful regexp characters in
				403	// 'unquoted'. The returned string, used as a regular expression,
				404	// will exactly match the original string. For example,
				405	// 1.5-2.0?
				406	// may become:
				407	// 1\.5\-2\.0\?
				408	static string QuoteMeta(const StringPiece& unquoted);
				409
				410	// Computes range for any strings matching regexp. The min and max can in
				411	// some cases be arbitrarily precise, so the caller gets to specify the
				412	// maximum desired length of string returned.
				413	//
				414	// Assuming PossibleMatchRange(&min, &max, N) returns successfully, any
				415	// string s that is an anchored match for this regexp satisfies
				416	// min <= s && s <= max.
				417	//
				418	// Note that PossibleMatchRange() will only consider the first copy of an
				419	// infinitely repeated element (i.e., any regexp element followed by a '*' or
				420	// '+' operator). Regexps with "{N}" constructions are not affected, as those
				421	// do not compile down to infinite repetitions.
				422	//
				423	// Returns true on success, false on error.
				424	bool PossibleMatchRange(string* min, string* max, int maxlen) const;
				425
				426	// Generic matching interface
				427
				428	// Type of match.
				429	enum Anchor {
				430	UNANCHORED, // No anchoring
				431	ANCHOR_START, // Anchor at start only
				432	ANCHOR_BOTH, // Anchor at start and end
				433	};
				434
				435	// Return the number of capturing subpatterns, or -1 if the
				436	// regexp wasn't valid on construction. The overall match ($0)
				437	// does not count: if the regexp is "(a)(b)", returns 2.
				438	int NumberOfCapturingGroups() const;
				439
				440
				441	// Return a map from names to capturing indices.
				442	// The map records the index of the leftmost group
				443	// with the given name.
				444	// Only valid until the re is deleted.
				445	const map<string, int>& NamedCapturingGroups() const;
				446
				447	// Return a map from capturing indices to names.
				448	// The map has no entries for unnamed groups.
				449	// Only valid until the re is deleted.
				450	const map<int, string>& CapturingGroupNames() const;
				451
				452	// General matching routine.
				453	// Match against text starting at offset startpos
				454	// and stopping the search at offset endpos.
				455	// Returns true if match found, false if not.
				456	// On a successful match, fills in match[] (up to nmatch entries)
				457	// with information about submatches.
				458	// I.e. matching RE2("(foo)|(bar)baz") on "barbazbla" will return true,
				459	// setting match[0] = "barbaz", match[1] = NULL, match[2] = "bar",
				460	// match[3] = NULL, ..., up to match[nmatch-1] = NULL.
				461	//
				462	// Don't ask for more match information than you will use:
				463	// runs much faster with nmatch == 1 than nmatch > 1, and
				464	// runs even faster if nmatch == 0.
				465	// Doesn't make sense to use nmatch > 1 + NumberOfCapturingGroups(),
				466	// but will be handled correctly.
				467	//
				468	// Passing text == StringPiece(NULL, 0) will be handled like any other
				469	// empty string, but note that on return, it will not be possible to tell
				470	// whether submatch i matched the empty string or did not match:
				471	// either way, match[i] == NULL.
				472	bool Match(const StringPiece& text,
				473	int startpos,
				474	int endpos,
				475	Anchor anchor,
				476	StringPiece *match,
				477	int nmatch) const;
				478
				479	// Check that the given rewrite string is suitable for use with this
				480	// regular expression. It checks that:
				481	// * The regular expression has enough parenthesized subexpressions
				482	// to satisfy all of the \N tokens in rewrite
				483	// * The rewrite string doesn't have any syntax errors. E.g.,
				484	// '\' followed by anything other than a digit or '\'.
				485	// A true return value guarantees that Replace() and Extract() won't
				486	// fail because of a bad rewrite string.
				487	bool CheckRewriteString(const StringPiece& rewrite, string* error) const;
				488
Alexander Gutkin	0d4c523	2013-02-28 13:47:27 +0000	[diff] [blame]	489	// Returns the maximum submatch needed for the rewrite to be done by
				490	// Replace(). E.g. if rewrite == "foo \\2,\\1", returns 2.
				491	static int MaxSubmatch(const StringPiece& rewrite);
				492
				493	// Append the "rewrite" string, with backslash substitutions from "vec",
				494	// to string "out".
				495	// Returns true on success. This method can fail because of a malformed
				496	// rewrite string. CheckRewriteString guarantees that the rewrite will
				497	// be successful.
				498	bool Rewrite(string *out,
				499	const StringPiece &rewrite,
				500	const StringPiece* vec,
				501	int veclen) const;
				502
Ian Hodson	2ee91b4	2012-05-14 12:29:36 +0100	[diff] [blame]	503	// Constructor options
				504	class Options {
				505	public:
				506	// The options are (defaults in parentheses):
				507	//
				508	// utf8 (true) text and pattern are UTF-8; otherwise Latin-1
				509	// posix_syntax (false) restrict regexps to POSIX egrep syntax
				510	// longest_match (false) search for longest match, not first match
				511	// log_errors (true) log syntax and execution errors to ERROR
				512	// max_mem (see below) approx. max memory footprint of RE2
				513	// literal (false) interpret string as literal, not regexp
				514	// never_nl (false) never match \n, even if it is in regexp
Alexander Gutkin	0d4c523	2013-02-28 13:47:27 +0000	[diff] [blame]	515	// never_capture (false) parse all parens as non-capturing
Ian Hodson	2ee91b4	2012-05-14 12:29:36 +0100	[diff] [blame]	516	// case_sensitive (true) match is case-sensitive (regexp can override
				517	// with (?i) unless in posix_syntax mode)
				518	//
				519	// The following options are only consulted when posix_syntax == true.
				520	// (When posix_syntax == false these features are always enabled and
				521	// cannot be turned off.)
				522	// perl_classes (false) allow Perl's \d \s \w \D \S \W
				523	// word_boundary (false) allow Perl's \b \B (word boundary and not)
				524	// one_line (false) ^ and $ only match beginning and end of text
				525	//
				526	// The max_mem option controls how much memory can be used
				527	// to hold the compiled form of the regexp (the Prog) and
				528	// its cached DFA graphs. Code Search placed limits on the number
				529	// of Prog instructions and DFA states: 10,000 for both.
				530	// In RE2, those limits would translate to about 240 KB per Prog
				531	// and perhaps 2.5 MB per DFA (DFA state sizes vary by regexp; RE2 does a
				532	// better job of keeping them small than Code Search did).
				533	// Each RE2 has two Progs (one forward, one reverse), and each Prog
				534	// can have two DFAs (one first match, one longest match).
				535	// That makes 4 DFAs:
				536	//
				537	// forward, first-match - used for UNANCHORED or ANCHOR_START searches
				538	// if opt.longest_match() == false
				539	// forward, longest-match - used for all ANCHOR_BOTH searches,
				540	// and the other two kinds if
				541	// opt.longest_match() == true
				542	// reverse, first-match - never used
				543	// reverse, longest-match - used as second phase for unanchored searches
				544	//
				545	// The RE2 memory budget is statically divided between the two
				546	// Progs and then the DFAs: two thirds to the forward Prog
				547	// and one third to the reverse Prog. The forward Prog gives half
				548	// of what it has left over to each of its DFAs. The reverse Prog
				549	// gives it all to its longest-match DFA.
				550	//
				551	// Once a DFA fills its budget, it flushes its cache and starts over.
				552	// If this happens too often, RE2 falls back on the NFA implementation.
				553
				554	// For now, make the default budget something close to Code Search.
				555	static const int kDefaultMaxMem = 8<<20;
				556
				557	enum Encoding {
				558	EncodingUTF8 = 1,
				559	EncodingLatin1
				560	};
				561
				562	Options() :
				563	encoding_(EncodingUTF8),
				564	posix_syntax_(false),
				565	longest_match_(false),
				566	log_errors_(true),
				567	max_mem_(kDefaultMaxMem),
				568	literal_(false),
				569	never_nl_(false),
Alexander Gutkin	0d4c523	2013-02-28 13:47:27 +0000	[diff] [blame]	570	never_capture_(false),
Ian Hodson	2ee91b4	2012-05-14 12:29:36 +0100	[diff] [blame]	571	case_sensitive_(true),
				572	perl_classes_(false),
				573	word_boundary_(false),
				574	one_line_(false) {
				575	}
Alexander Gutkin	0d4c523	2013-02-28 13:47:27 +0000	[diff] [blame]	576
				577	/*implicit*/ Options(CannedOptions);
Ian Hodson	2ee91b4	2012-05-14 12:29:36 +0100	[diff] [blame]	578
				579	Encoding encoding() const { return encoding_; }
				580	void set_encoding(Encoding encoding) { encoding_ = encoding; }
				581
				582	// Legacy interface to encoding.
				583	// TODO(rsc): Remove once clients have been converted.
				584	bool utf8() const { return encoding_ == EncodingUTF8; }
				585	void set_utf8(bool b) {
				586	if (b) {
				587	encoding_ = EncodingUTF8;
				588	} else {
				589	encoding_ = EncodingLatin1;
				590	}
				591	}
				592
				593	bool posix_syntax() const { return posix_syntax_; }
				594	void set_posix_syntax(bool b) { posix_syntax_ = b; }
				595
				596	bool longest_match() const { return longest_match_; }
				597	void set_longest_match(bool b) { longest_match_ = b; }
				598
				599	bool log_errors() const { return log_errors_; }
				600	void set_log_errors(bool b) { log_errors_ = b; }
				601
				602	int64_t max_mem() const { return max_mem_; }  // approx. memory budget; returned at full 64-bit width to match max_mem_ (an int return would truncate)
				603	void set_max_mem(int64_t m) { max_mem_ = m; }  // takes int64_t so budgets above INT_MAX can be stored; int arguments still widen implicitly
				604
				605	bool literal() const { return literal_; }
				606	void set_literal(bool b) { literal_ = b; }
				607
				608	bool never_nl() const { return never_nl_; }
				609	void set_never_nl(bool b) { never_nl_ = b; }
				610
Alexander Gutkin	0d4c523	2013-02-28 13:47:27 +0000	[diff] [blame]	611	bool never_capture() const { return never_capture_; }
				612	void set_never_capture(bool b) { never_capture_ = b; }
				613
Ian Hodson	2ee91b4	2012-05-14 12:29:36 +0100	[diff] [blame]	614	bool case_sensitive() const { return case_sensitive_; }
				615	void set_case_sensitive(bool b) { case_sensitive_ = b; }
				616
				617	bool perl_classes() const { return perl_classes_; }
				618	void set_perl_classes(bool b) { perl_classes_ = b; }
				619
				620	bool word_boundary() const { return word_boundary_; }
				621	void set_word_boundary(bool b) { word_boundary_ = b; }
				622
				623	bool one_line() const { return one_line_; }
				624	void set_one_line(bool b) { one_line_ = b; }
				625
				626	void Copy(const Options& src) {
				627	encoding_ = src.encoding_;
				628	posix_syntax_ = src.posix_syntax_;
				629	longest_match_ = src.longest_match_;
				630	log_errors_ = src.log_errors_;
				631	max_mem_ = src.max_mem_;
				632	literal_ = src.literal_;
				633	never_nl_ = src.never_nl_;
Alexander Gutkin	0d4c523	2013-02-28 13:47:27 +0000	[diff] [blame]	634	never_capture_ = src.never_capture_;
Ian Hodson	2ee91b4	2012-05-14 12:29:36 +0100	[diff] [blame]	635	case_sensitive_ = src.case_sensitive_;
				636	perl_classes_ = src.perl_classes_;
				637	word_boundary_ = src.word_boundary_;
				638	one_line_ = src.one_line_;
				639	}
				640
				641	int ParseFlags() const;
				642
				643	private:
Ian Hodson	2ee91b4	2012-05-14 12:29:36 +0100	[diff] [blame]	644	Encoding encoding_;
				645	bool posix_syntax_;
				646	bool longest_match_;
				647	bool log_errors_;
				648	int64_t max_mem_;
				649	bool literal_;
				650	bool never_nl_;
Alexander Gutkin	0d4c523	2013-02-28 13:47:27 +0000	[diff] [blame]	651	bool never_capture_;
Ian Hodson	2ee91b4	2012-05-14 12:29:36 +0100	[diff] [blame]	652	bool case_sensitive_;
				653	bool perl_classes_;
				654	bool word_boundary_;
				655	bool one_line_;
				656
				657	//DISALLOW_EVIL_CONSTRUCTORS(Options);
				658	Options(const Options&);
				659	void operator=(const Options&);
				660	};
				661
				662	// Returns the options set in the constructor.
				663	const Options& options() const { return options_; };
				664
				665	// Argument converters; see below.
				// Each overload takes a pointer to an integer destination and returns
				// an Arg. The inline definitions near the end of this header build
				// that Arg from the pointer plus the matching
				// RE2::Arg::parse_<type>_cradix / _hex / _octal parser.
				// NOTE(review): "CRadix" presumably means the radix is inferred from
				// a C-style prefix -- confirm in the parser definitions.
				666	static inline Arg CRadix(short* x);
				667	static inline Arg CRadix(unsigned short* x);
				668	static inline Arg CRadix(int* x);
				669	static inline Arg CRadix(unsigned int* x);
				670	static inline Arg CRadix(long* x);
				671	static inline Arg CRadix(unsigned long* x);
				672	static inline Arg CRadix(long long* x);
				673	static inline Arg CRadix(unsigned long long* x);
				674
				// Same overload set, bound to the parse_<type>_hex parsers.
				675	static inline Arg Hex(short* x);
				676	static inline Arg Hex(unsigned short* x);
				677	static inline Arg Hex(int* x);
				678	static inline Arg Hex(unsigned int* x);
				679	static inline Arg Hex(long* x);
				680	static inline Arg Hex(unsigned long* x);
				681	static inline Arg Hex(long long* x);
				682	static inline Arg Hex(unsigned long long* x);
				683
				// Same overload set, bound to the parse_<type>_octal parsers.
				684	static inline Arg Octal(short* x);
				685	static inline Arg Octal(unsigned short* x);
				686	static inline Arg Octal(int* x);
				687	static inline Arg Octal(unsigned int* x);
				688	static inline Arg Octal(long* x);
				689	static inline Arg Octal(unsigned long* x);
				690	static inline Arg Octal(long long* x);
				691	static inline Arg Octal(unsigned long long* x);
				692
				693	private:
				// Declaration only.
				// NOTE(review): presumably the common setup routine the constructors
				// delegate to -- confirm at the definition.
				694	void Init(const StringPiece& pattern, const Options& options);
				695
				// Declaration only. Takes the input text, an anchoring mode, an
				// int* out-parameter named `consumed`, and an array of `n` Arg
				// pointers.
				// NOTE(review): presumably the shared worker behind the public
				// matching entry points -- confirm at the definition.
Ian Hodson	2ee91b4	2012-05-14 12:29:36 +0100	[diff] [blame]	696	bool DoMatch(const StringPiece& text,
				697	Anchor anchor,
				698	int* consumed,
				699	const Arg* const args[],
				700	int n) const;
				701
				// Const member returning a Prog*; rprog_ below is declared mutable.
				// NOTE(review): presumably builds and caches rprog_ on first use --
				// confirm at the definition.
				702	re2::Prog* ReverseProg() const;
				703
				// Members marked `mutable` can be modified from const member functions.
				704	mutable Mutex* mutex_;
				705	string pattern_; // string regular expression
				706	Options options_; // option flags
				707	string prefix_; // required prefix (before regexp_)
				708	bool prefix_foldcase_; // prefix is ASCII case-insensitive
				709	re2::Regexp* entire_regexp_; // parsed regular expression
				710	re2::Regexp* suffix_regexp_; // parsed regular expression, prefix removed
				711	re2::Prog* prog_; // compiled program for regexp
				712	mutable re2::Prog* rprog_; // reverse program for regexp
				713	bool is_one_pass_; // can use prog_->SearchOnePass?
				714	mutable const string* error_; // Error indicator
				715	// (or points to empty string)
				716	mutable ErrorCode error_code_; // Error code
				717	mutable string error_arg_; // Fragment of regexp showing error
				718	mutable int num_captures_; // Number of capturing groups
				719
				720	// Map from capture names to indices
				721	mutable const map<string, int>* named_groups_;
				722
				723	// Map from capture indices to names
				724	mutable const map<int, string>* group_names_;
				725
				726	//DISALLOW_EVIL_CONSTRUCTORS(RE2);
				// Copy construction and assignment are declared private (spelled out
				// here in place of the commented-out macro above), so code outside
				// this class cannot copy an RE2 by value.
				727	RE2(const RE2&);
				728	void operator=(const RE2&);
				729	};
				730
				731	/*** Implementation details ***/
				732
				733	// Hex/Octal/Binary?
				734
				// Adapter for parsing into any object type T that provides a member
				//   ParseFrom(const char* str, int n)
				// whose result converts to bool. Parse() has the same signature as the
				// other RE2::Arg parsers, which is why the generic RE2::Arg constructor
				// can store it next to a void* destination.
				template <class T>
				class _RE2_MatchObject {
				 public:
				  // Forwards (str, n) to dest->ParseFrom().
				  //   str, n : passed through to ParseFrom unchanged.
				  //   dest   : type-erased pointer to a T, or NULL.
				  // Returns true without calling ParseFrom when dest is NULL; otherwise
				  // returns whatever ParseFrom returns.
				  static inline bool Parse(const char* str, int n, void* dest) {
				    if (dest == NULL) return true;
				    // A void* that was produced from a T* converts back with static_cast;
				    // reinterpret_cast is not needed for this conversion.
				    T* object = static_cast<T*>(dest);
				    return object->ParseFrom(str, n);
				  }
				};
				745
				746	class RE2::Arg {
				747	public:
				748	// Empty constructor so we can declare arrays of RE2::Arg
				749	Arg();
				750
				751	// Constructor specially designed for NULL arguments
				752	Arg(void*);
				753
				754	typedef bool (Parser)(const char str, int n, void* dest);
				755
				756	// Type-specific parsers
				757	#define MAKE_PARSER(type,name) \
				758	Arg(type* p) : arg_(p), parser_(name) { } \
				759	Arg(type* p, Parser parser) : arg_(p), parser_(parser) { } \
				760
				761
				762	MAKE_PARSER(char, parse_char);
				763	MAKE_PARSER(signed char, parse_char);
				764	MAKE_PARSER(unsigned char, parse_uchar);
				765	MAKE_PARSER(short, parse_short);
				766	MAKE_PARSER(unsigned short, parse_ushort);
				767	MAKE_PARSER(int, parse_int);
				768	MAKE_PARSER(unsigned int, parse_uint);
				769	MAKE_PARSER(long, parse_long);
				770	MAKE_PARSER(unsigned long, parse_ulong);
				771	MAKE_PARSER(long long, parse_longlong);
				772	MAKE_PARSER(unsigned long long, parse_ulonglong);
				773	MAKE_PARSER(float, parse_float);
				774	MAKE_PARSER(double, parse_double);
				775	MAKE_PARSER(string, parse_string);
				776	MAKE_PARSER(StringPiece, parse_stringpiece);
				777
				778	#undef MAKE_PARSER
				779
				780	// Generic constructor
				781	template <class T> Arg(T*, Parser parser);
				782	// Generic constructor template
				783	template <class T> Arg(T* p)
				784	: arg_(p), parser_(_RE2_MatchObject<T>::Parse) {
				785	}
				786
				787	// Parse the data
				788	bool Parse(const char* str, int n) const;
				789
				790	private:
				791	void* arg_;
				792	Parser parser_;
				793
				794	static bool parse_null (const char* str, int n, void* dest);
				795	static bool parse_char (const char* str, int n, void* dest);
				796	static bool parse_uchar (const char* str, int n, void* dest);
				797	static bool parse_float (const char* str, int n, void* dest);
				798	static bool parse_double (const char* str, int n, void* dest);
				799	static bool parse_string (const char* str, int n, void* dest);
				800	static bool parse_stringpiece (const char* str, int n, void* dest);
				801
				802	#define DECLARE_INTEGER_PARSER(name) \
				803	private: \
				804	static bool parse_ ## name(const char* str, int n, void* dest); \
				805	static bool parse_ ## name ## _radix( \
				806	const char* str, int n, void* dest, int radix); \
				807	public: \
				808	static bool parse_ ## name ## _hex(const char* str, int n, void* dest); \
				809	static bool parse_ ## name ## _octal(const char* str, int n, void* dest); \
				810	static bool parse_ ## name ## _cradix(const char* str, int n, void* dest)
				811
				812	DECLARE_INTEGER_PARSER(short);
				813	DECLARE_INTEGER_PARSER(ushort);
				814	DECLARE_INTEGER_PARSER(int);
				815	DECLARE_INTEGER_PARSER(uint);
				816	DECLARE_INTEGER_PARSER(long);
				817	DECLARE_INTEGER_PARSER(ulong);
				818	DECLARE_INTEGER_PARSER(longlong);
				819	DECLARE_INTEGER_PARSER(ulonglong);
				820
				821	#undef DECLARE_INTEGER_PARSER
				822	};
				823
				824	inline RE2::Arg::Arg() : arg_(NULL), parser_(parse_null) { }
				825	inline RE2::Arg::Arg(void* p) : arg_(p), parser_(parse_null) { }
				826
				827	inline bool RE2::Arg::Parse(const char* str, int n) const {
				828	return (*parser_)(str, n, arg_);
				829	}
				830
				831	// This part of the parser, appropriate only for ints, deals with bases
				832	#define MAKE_INTEGER_PARSER(type, name) \
				833	inline RE2::Arg RE2::Hex(type* ptr) { \
				834	return RE2::Arg(ptr, RE2::Arg::parse_ ## name ## _hex); } \
				835	inline RE2::Arg RE2::Octal(type* ptr) { \
				836	return RE2::Arg(ptr, RE2::Arg::parse_ ## name ## _octal); } \
				837	inline RE2::Arg RE2::CRadix(type* ptr) { \
				838	return RE2::Arg(ptr, RE2::Arg::parse_ ## name ## _cradix); }
				839
				840	MAKE_INTEGER_PARSER(short, short);
				841	MAKE_INTEGER_PARSER(unsigned short, ushort);
				842	MAKE_INTEGER_PARSER(int, int);
				843	MAKE_INTEGER_PARSER(unsigned int, uint);
				844	MAKE_INTEGER_PARSER(long, long);
				845	MAKE_INTEGER_PARSER(unsigned long, ulong);
				846	MAKE_INTEGER_PARSER(long long, longlong);
				847	MAKE_INTEGER_PARSER(unsigned long long, ulonglong);
				848
				849	#undef MAKE_INTEGER_PARSER
				850
				851	} // namespace re2
				852
				853	using re2::RE2;
				854
				855	#endif /* RE2_RE2_H */