Blame - re2/simplify.cc - fp2-dev/platform/external/regex-re2

blob: faf32084e05f9f171b33fdc076d465e0ec693600 [file] [log] [blame]

Ian Hodson	2ee91b4	2012-05-14 12:29:36 +0100	[diff] [blame]	1	// Copyright 2006 The RE2 Authors. All Rights Reserved.
				2	// Use of this source code is governed by a BSD-style
				3	// license that can be found in the LICENSE file.
				4
				5	// Rewrite POSIX and other features in re
				6	// to use simple extended regular expression features.
				7	// Also sort and simplify character classes.
				8
				9	#include "util/util.h"
				10	#include "re2/regexp.h"
				11	#include "re2/walker-inl.h"
				12
				13	namespace re2 {
				14
				15	// Parses the regexp src and then simplifies it and sets *dst to the
				16	// string representation of the simplified form. Returns true on success.
				17	// Returns false and sets *error (if error != NULL) on error.
				18	bool Regexp::SimplifyRegexp(const StringPiece& src, ParseFlags flags,
				19	string* dst,
				20	RegexpStatus* status) {
				21	Regexp* re = Parse(src, flags, status);
				22	if (re == NULL)
				23	return false;
				24	Regexp* sre = re->Simplify();
				25	re->Decref();
				26	if (sre == NULL) {
				27	// Should not happen, since Simplify never fails.
				28	LOG(ERROR) << "Simplify failed on " << src;
				29	if (status) {
				30	status->set_code(kRegexpInternalError);
				31	status->set_error_arg(src);
				32	}
				33	return false;
				34	}
				35	*dst = sre->ToString();
				36	sre->Decref();
				37	return true;
				38	}
				39
				40	// Assuming the simple_ flags on the children are accurate,
				41	// is this Regexp* simple?
				42	bool Regexp::ComputeSimple() {
				43	Regexp** subs;
				44	switch (op_) {
				45	case kRegexpNoMatch:
				46	case kRegexpEmptyMatch:
				47	case kRegexpLiteral:
				48	case kRegexpLiteralString:
				49	case kRegexpBeginLine:
				50	case kRegexpEndLine:
				51	case kRegexpBeginText:
				52	case kRegexpWordBoundary:
				53	case kRegexpNoWordBoundary:
				54	case kRegexpEndText:
				55	case kRegexpAnyChar:
				56	case kRegexpAnyByte:
				57	case kRegexpHaveMatch:
				58	return true;
				59	case kRegexpConcat:
				60	case kRegexpAlternate:
				61	// These are simple as long as the subpieces are simple.
				62	subs = sub();
				63	for (int i = 0; i < nsub_; i++)
				64	if (!subs[i]->simple_)
				65	return false;
				66	return true;
				67	case kRegexpCharClass:
				68	// Simple as long as the char class is not empty, not full.
				69	if (ccb_ != NULL)
				70	return !ccb_->empty() && !ccb_->full();
				71	return !cc_->empty() && !cc_->full();
				72	case kRegexpCapture:
				73	subs = sub();
				74	return subs[0]->simple_;
				75	case kRegexpStar:
				76	case kRegexpPlus:
				77	case kRegexpQuest:
				78	subs = sub();
				79	if (!subs[0]->simple_)
				80	return false;
				81	switch (subs[0]->op_) {
				82	case kRegexpStar:
				83	case kRegexpPlus:
				84	case kRegexpQuest:
				85	case kRegexpEmptyMatch:
				86	case kRegexpNoMatch:
				87	return false;
				88	default:
				89	break;
				90	}
				91	return true;
				92	case kRegexpRepeat:
				93	return false;
				94	}
				95	LOG(DFATAL) << "Case not handled in ComputeSimple: " << op_;
				96	return false;
				97	}
				98
				99	// Walker subclass used by Simplify.
				100	// The simplify walk is purely post-recursive: given the simplified children,
				101	// PostVisit creates the simplified result.
				102	// The child_args are simplified Regexp*s.
				103	class SimplifyWalker : public Regexp::Walker<Regexp*> {
				104	public:
				105	SimplifyWalker() {}
				106	virtual Regexp* PreVisit(Regexp* re, Regexp* parent_arg, bool* stop);
				107	virtual Regexp* PostVisit(Regexp* re,
				108	Regexp* parent_arg,
				109	Regexp* pre_arg,
				110	Regexp** child_args, int nchild_args);
				111	virtual Regexp* Copy(Regexp* re);
				112	virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg);
				113
				114	private:
				115	// These functions are declared inside SimplifyWalker so that
				116	// they can edit the private fields of the Regexps they construct.
				117
				118	// Creates a concatenation of two Regexp, consuming refs to re1 and re2.
				119	// Caller must Decref return value when done with it.
				120	static Regexp* Concat2(Regexp* re1, Regexp* re2, Regexp::ParseFlags flags);
				121
				122	// Simplifies the expression re{min,max} in terms of *, +, and ?.
				123	// Returns a new regexp. Does not edit re. Does not consume reference to re.
				124	// Caller must Decref return value when done with it.
				125	static Regexp* SimplifyRepeat(Regexp* re, int min, int max,
				126	Regexp::ParseFlags parse_flags);
				127
				128	// Simplifies a character class by expanding any named classes
				129	// into rune ranges. Does not edit re. Does not consume ref to re.
				130	// Caller must Decref return value when done with it.
				131	static Regexp* SimplifyCharClass(Regexp* re);
				132
				133	DISALLOW_EVIL_CONSTRUCTORS(SimplifyWalker);
				134	};
				135
				136	// Simplifies a regular expression, returning a new regexp.
				137	// The new regexp uses traditional Unix egrep features only,
				138	// plus the Perl (?:) non-capturing parentheses.
				139	// Otherwise, no POSIX or Perl additions. The new regexp
				140	// captures exactly the same subexpressions (with the same indices)
				141	// as the original.
				142	// Does not edit current object.
				143	// Caller must Decref() return value when done with it.
				144
				145	Regexp* Regexp::Simplify() {
				146	if (simple_)
				147	return Incref();
				148	SimplifyWalker w;
				149	return w.Walk(this, NULL);
				150	}
				151
				152	#define Simplify DontCallSimplify // Avoid accidental recursion
				153
				154	Regexp* SimplifyWalker::Copy(Regexp* re) {
				155	return re->Incref();
				156	}
				157
				158	Regexp* SimplifyWalker::ShortVisit(Regexp* re, Regexp* parent_arg) {
				159	// This should never be called, since we use Walk and not
				160	// WalkExponential.
				161	LOG(DFATAL) << "SimplifyWalker::ShortVisit called";
				162	return re->Incref();
				163	}
				164
				165	Regexp* SimplifyWalker::PreVisit(Regexp* re, Regexp* parent_arg, bool* stop) {
				166	if (re->simple_) {
				167	*stop = true;
				168	return re->Incref();
				169	}
				170	return NULL;
				171	}
				172
				173	Regexp* SimplifyWalker::PostVisit(Regexp* re,
				174	Regexp* parent_arg,
				175	Regexp* pre_arg,
				176	Regexp** child_args,
				177	int nchild_args) {
				178	switch (re->op()) {
				179	case kRegexpNoMatch:
				180	case kRegexpEmptyMatch:
				181	case kRegexpLiteral:
				182	case kRegexpLiteralString:
				183	case kRegexpBeginLine:
				184	case kRegexpEndLine:
				185	case kRegexpBeginText:
				186	case kRegexpWordBoundary:
				187	case kRegexpNoWordBoundary:
				188	case kRegexpEndText:
				189	case kRegexpAnyChar:
				190	case kRegexpAnyByte:
				191	case kRegexpHaveMatch:
				192	// All these are always simple.
				193	re->simple_ = true;
				194	return re->Incref();
				195
				196	case kRegexpConcat:
				197	case kRegexpAlternate: {
				198	// These are simple as long as the subpieces are simple.
				199	// Two passes to avoid allocation in the common case.
				200	bool changed = false;
				201	Regexp** subs = re->sub();
				202	for (int i = 0; i < re->nsub_; i++) {
				203	Regexp* sub = subs[i];
				204	Regexp* newsub = child_args[i];
				205	if (newsub != sub) {
				206	changed = true;
				207	break;
				208	}
				209	}
				210	if (!changed) {
				211	for (int i = 0; i < re->nsub_; i++) {
				212	Regexp* newsub = child_args[i];
				213	newsub->Decref();
				214	}
				215	re->simple_ = true;
				216	return re->Incref();
				217	}
				218	Regexp* nre = new Regexp(re->op(), re->parse_flags());
				219	nre->AllocSub(re->nsub_);
				220	Regexp** nre_subs = nre->sub();
				221	for (int i = 0; i <re->nsub_; i++)
				222	nre_subs[i] = child_args[i];
				223	nre->simple_ = true;
				224	return nre;
				225	}
				226
				227	case kRegexpCapture: {
				228	Regexp* newsub = child_args[0];
				229	if (newsub == re->sub()[0]) {
				230	newsub->Decref();
				231	re->simple_ = true;
				232	return re->Incref();
				233	}
				234	Regexp* nre = new Regexp(kRegexpCapture, re->parse_flags());
				235	nre->AllocSub(1);
				236	nre->sub()[0] = newsub;
				237	nre->cap_ = re->cap_;
				238	nre->simple_ = true;
				239	return nre;
				240	}
				241
				242	case kRegexpStar:
				243	case kRegexpPlus:
				244	case kRegexpQuest: {
				245	Regexp* newsub = child_args[0];
				246	// Special case: repeat the empty string as much as
				247	// you want, but it's still the empty string.
				248	if (newsub->op() == kRegexpEmptyMatch)
				249	return newsub;
				250
				251	// These are simple as long as the subpiece is simple.
				252	if (newsub == re->sub()[0]) {
				253	newsub->Decref();
				254	re->simple_ = true;
				255	return re->Incref();
				256	}
				257
				258	// These are also idempotent if flags are constant.
				259	if (re->op() == newsub->op() &&
				260	re->parse_flags() == newsub->parse_flags())
				261	return newsub;
				262
				263	Regexp* nre = new Regexp(re->op(), re->parse_flags());
				264	nre->AllocSub(1);
				265	nre->sub()[0] = newsub;
				266	nre->simple_ = true;
				267	return nre;
				268	}
				269
				270	case kRegexpRepeat: {
				271	Regexp* newsub = child_args[0];
				272	// Special case: repeat the empty string as much as
				273	// you want, but it's still the empty string.
				274	if (newsub->op() == kRegexpEmptyMatch)
				275	return newsub;
				276
				277	Regexp* nre = SimplifyRepeat(newsub, re->min_, re->max_,
				278	re->parse_flags());
				279	newsub->Decref();
				280	nre->simple_ = true;
				281	return nre;
				282	}
				283
				284	case kRegexpCharClass: {
				285	Regexp* nre = SimplifyCharClass(re);
				286	nre->simple_ = true;
				287	return nre;
				288	}
				289	}
				290
				291	LOG(ERROR) << "Simplify case not handled: " << re->op();
				292	return re->Incref();
				293	}
				294
				295	// Creates a concatenation of two Regexp, consuming refs to re1 and re2.
				296	// Returns a new Regexp, handing the ref to the caller.
				297	Regexp* SimplifyWalker::Concat2(Regexp* re1, Regexp* re2,
				298	Regexp::ParseFlags parse_flags) {
				299	Regexp* re = new Regexp(kRegexpConcat, parse_flags);
				300	re->AllocSub(2);
				301	Regexp** subs = re->sub();
				302	subs[0] = re1;
				303	subs[1] = re2;
				304	return re;
				305	}
				306
				307	// Simplifies the expression re{min,max} in terms of *, +, and ?.
				308	// Returns a new regexp. Does not edit re. Does not consume reference to re.
				309	// Caller must Decref return value when done with it.
				310	// The result will not necessarily have the right capturing parens
				311	// if you call ToString() and re-parse it: (x){2} becomes (x)(x),
				312	// but in the Regexp* representation, both (x) are marked as $1.
				313	Regexp* SimplifyWalker::SimplifyRepeat(Regexp* re, int min, int max,
				314	Regexp::ParseFlags f) {
				315	// x{n,} means at least n matches of x.
				316	if (max == -1) {
				317	// Special case: x{0,} is x*
				318	if (min == 0)
				319	return Regexp::Star(re->Incref(), f);
				320
				321	// Special case: x{1,} is x+
				322	if (min == 1)
				323	return Regexp::Plus(re->Incref(), f);
				324
				325	// General case: x{4,} is xxxx+
				326	Regexp* nre = new Regexp(kRegexpConcat, f);
				327	nre->AllocSub(min);
				328	VLOG(1) << "Simplify " << min;
				329	Regexp** nre_subs = nre->sub();
				330	for (int i = 0; i < min-1; i++)
				331	nre_subs[i] = re->Incref();
				332	nre_subs[min-1] = Regexp::Plus(re->Incref(), f);
				333	return nre;
				334	}
				335
				336	// Special case: (x){0} matches only empty string.
				337	if (min == 0 && max == 0)
				338	return new Regexp(kRegexpEmptyMatch, f);
				339
				340	// Special case: x{1} is just x.
				341	if (min == 1 && max == 1)
				342	return re->Incref();
				343
				344	// General case: x{n,m} means n copies of x and m copies of x?.
				345	// The machine will do less work if we nest the final m copies,
				346	// so that x{2,5} = xx(x(x(x)?)?)?
				347
				348	// Build leading prefix: xx. Capturing only on the last one.
				349	Regexp* nre = NULL;
				350	if (min > 0) {
				351	nre = new Regexp(kRegexpConcat, f);
				352	nre->AllocSub(min);
				353	Regexp** nre_subs = nre->sub();
				354	for (int i = 0; i < min; i++)
				355	nre_subs[i] = re->Incref();
				356	}
				357
				358	// Build and attach suffix: (x(x(x)?)?)?
				359	if (max > min) {
				360	Regexp* suf = Regexp::Quest(re->Incref(), f);
				361	for (int i = min+1; i < max; i++)
				362	suf = Regexp::Quest(Concat2(re->Incref(), suf, f), f);
				363	if (nre == NULL)
				364	nre = suf;
				365	else
				366	nre = Concat2(nre, suf, f);
				367	}
				368
				369	if (nre == NULL) {
				370	// Some degenerate case, like min > max, or min < max < 0.
				371	// This shouldn't happen, because the parser rejects such regexps.
				372	LOG(DFATAL) << "Malformed repeat " << re->ToString() << " " << min << " " << max;
				373	return new Regexp(kRegexpNoMatch, f);
				374	}
				375
				376	return nre;
				377	}
				378
				379	// Simplifies a character class.
				380	// Caller must Decref return value when done with it.
				381	Regexp* SimplifyWalker::SimplifyCharClass(Regexp* re) {
				382	CharClass* cc = re->cc();
				383
				384	// Special cases
				385	if (cc->empty())
				386	return new Regexp(kRegexpNoMatch, re->parse_flags());
				387	if (cc->full())
				388	return new Regexp(kRegexpAnyChar, re->parse_flags());
				389
				390	return re->Incref();
				391	}
				392
				393	} // namespace re2