Blame - re2/regexp.cc - fp2-dev/platform/external/regex-re2

blob: a74ceec8271fd8c5550b9414d33ef2d0504f3246 [file] [log] [blame]

Ian Hodson	2ee91b4	2012-05-14 12:29:36 +0100	[diff] [blame]	1	// Copyright 2006 The RE2 Authors. All Rights Reserved.
				2	// Use of this source code is governed by a BSD-style
				3	// license that can be found in the LICENSE file.
				4
				5	// Regular expression representation.
				6	// Tested by parse_test.cc
				7
				8	#include "util/util.h"
				9	#include "re2/regexp.h"
				10	#include "re2/stringpiece.h"
				11	#include "re2/walker-inl.h"
				12
				13	namespace re2 {
				14
				15	// Constructor. Allocates vectors as appropriate for operator.
				16	Regexp::Regexp(RegexpOp op, ParseFlags parse_flags)
				17	: op_(op),
				18	simple_(false),
				19	parse_flags_(static_cast<uint16>(parse_flags)),
				20	ref_(1),
				21	nsub_(0),
				22	down_(NULL) {
				23	subone_ = NULL;
				24	memset(the_union_, 0, sizeof the_union_);
				25	}
				26
				27	// Destructor. Assumes already cleaned up children.
				28	// Private: use Decref() instead of delete to destroy Regexps.
				29	// Can't call Decref on the sub-Regexps here because
				30	// that could cause arbitrarily deep recursion, so
				31	// required Decref() to have handled them for us.
				32	Regexp::~Regexp() {
				33	if (nsub_ > 0)
				34	LOG(DFATAL) << "Regexp not destroyed.";
				35
				36	switch (op_) {
				37	default:
				38	break;
				39	case kRegexpCapture:
				40	delete name_;
				41	break;
				42	case kRegexpLiteralString:
				43	delete[] runes_;
				44	break;
				45	case kRegexpCharClass:
				46	cc_->Delete();
				47	delete ccb_;
				48	break;
				49	}
				50	}
				51
				52	// If it's possible to destroy this regexp without recurring,
				53	// do so and return true. Else return false.
				54	bool Regexp::QuickDestroy() {
				55	if (nsub_ == 0) {
				56	delete this;
				57	return true;
				58	}
				59	return false;
				60	}
				61
Alexander Gutkin	0d4c523	2013-02-28 13:47:27 +0000	[diff] [blame]	62	static map<Regexp, int> ref_map;
				63	GLOBAL_MUTEX(ref_mutex);
Ian Hodson	2ee91b4	2012-05-14 12:29:36 +0100	[diff] [blame]	64
				65	int Regexp::Ref() {
				66	if (ref_ < kMaxRef)
				67	return ref_;
				68
Alexander Gutkin	0d4c523	2013-02-28 13:47:27 +0000	[diff] [blame]	69	GLOBAL_MUTEX_LOCK(ref_mutex);
				70	int r = 0;
				71	if (ref_map != NULL) {
				72	r = (*ref_map)[this];
				73	}
				74	GLOBAL_MUTEX_UNLOCK(ref_mutex);
				75	return r;
Ian Hodson	2ee91b4	2012-05-14 12:29:36 +0100	[diff] [blame]	76	}
				77
				78	// Increments reference count, returns object as convenience.
				79	Regexp* Regexp::Incref() {
				80	if (ref_ >= kMaxRef-1) {
				81	// Store ref count in overflow map.
Alexander Gutkin	0d4c523	2013-02-28 13:47:27 +0000	[diff] [blame]	82	GLOBAL_MUTEX_LOCK(ref_mutex);
				83	if (ref_map == NULL) {
				84	ref_map = new map<Regexp*, int>;
Ian Hodson	2ee91b4	2012-05-14 12:29:36 +0100	[diff] [blame]	85	}
Alexander Gutkin	0d4c523	2013-02-28 13:47:27 +0000	[diff] [blame]	86	if (ref_ == kMaxRef) {
				87	// already overflowed
				88	(*ref_map)[this]++;
				89	} else {
				90	// overflowing now
				91	(*ref_map)[this] = kMaxRef;
				92	ref_ = kMaxRef;
				93	}
				94	GLOBAL_MUTEX_UNLOCK(ref_mutex);
Ian Hodson	2ee91b4	2012-05-14 12:29:36 +0100	[diff] [blame]	95	return this;
				96	}
				97
				98	ref_++;
				99	return this;
				100	}
				101
				102	// Decrements reference count and deletes this object if count reaches 0.
				103	void Regexp::Decref() {
				104	if (ref_ == kMaxRef) {
				105	// Ref count is stored in overflow map.
Alexander Gutkin	0d4c523	2013-02-28 13:47:27 +0000	[diff] [blame]	106	GLOBAL_MUTEX_LOCK(ref_mutex);
				107	int r = (*ref_map)[this] - 1;
Ian Hodson	2ee91b4	2012-05-14 12:29:36 +0100	[diff] [blame]	108	if (r < kMaxRef) {
				109	ref_ = r;
Alexander Gutkin	0d4c523	2013-02-28 13:47:27 +0000	[diff] [blame]	110	ref_map->erase(this);
Ian Hodson	2ee91b4	2012-05-14 12:29:36 +0100	[diff] [blame]	111	} else {
Alexander Gutkin	0d4c523	2013-02-28 13:47:27 +0000	[diff] [blame]	112	(*ref_map)[this] = r;
Ian Hodson	2ee91b4	2012-05-14 12:29:36 +0100	[diff] [blame]	113	}
Alexander Gutkin	0d4c523	2013-02-28 13:47:27 +0000	[diff] [blame]	114	GLOBAL_MUTEX_UNLOCK(ref_mutex);
Ian Hodson	2ee91b4	2012-05-14 12:29:36 +0100	[diff] [blame]	115	return;
				116	}
				117	ref_--;
				118	if (ref_ == 0)
				119	Destroy();
				120	}
				121
				122	// Deletes this object; ref count has count reached 0.
				123	void Regexp::Destroy() {
				124	if (QuickDestroy())
				125	return;
				126
				127	// Handle recursive Destroy with explicit stack
				128	// to avoid arbitrarily deep recursion on process stack [sigh].
				129	down_ = NULL;
				130	Regexp* stack = this;
				131	while (stack != NULL) {
				132	Regexp* re = stack;
				133	stack = re->down_;
				134	if (re->ref_ != 0)
				135	LOG(DFATAL) << "Bad reference count " << re->ref_;
				136	if (re->nsub_ > 0) {
				137	Regexp** subs = re->sub();
				138	for (int i = 0; i < re->nsub_; i++) {
				139	Regexp* sub = subs[i];
				140	if (sub == NULL)
				141	continue;
				142	if (sub->ref_ == kMaxRef)
				143	sub->Decref();
				144	else
				145	--sub->ref_;
				146	if (sub->ref_ == 0 && !sub->QuickDestroy()) {
				147	sub->down_ = stack;
				148	stack = sub;
				149	}
				150	}
				151	if (re->nsub_ > 1)
				152	delete[] subs;
				153	re->nsub_ = 0;
				154	}
				155	delete re;
				156	}
				157	}
				158
				159	void Regexp::AddRuneToString(Rune r) {
				160	DCHECK(op_ == kRegexpLiteralString);
				161	if (nrunes_ == 0) {
				162	// start with 8
				163	runes_ = new Rune[8];
				164	} else if (nrunes_ >= 8 && (nrunes_ & (nrunes_ - 1)) == 0) {
				165	// double on powers of two
				166	Rune *old = runes_;
				167	runes_ = new Rune[nrunes_ * 2];
				168	for (int i = 0; i < nrunes_; i++)
				169	runes_[i] = old[i];
				170	delete[] old;
				171	}
				172
				173	runes_[nrunes_++] = r;
				174	}
				175
				176	Regexp* Regexp::HaveMatch(int match_id, ParseFlags flags) {
				177	Regexp* re = new Regexp(kRegexpHaveMatch, flags);
				178	re->match_id_ = match_id;
				179	return re;
				180	}
				181
				182	Regexp* Regexp::Plus(Regexp* sub, ParseFlags flags) {
				183	if (sub->op() == kRegexpPlus && sub->parse_flags() == flags)
				184	return sub;
				185	Regexp* re = new Regexp(kRegexpPlus, flags);
				186	re->AllocSub(1);
				187	re->sub()[0] = sub;
				188	return re;
				189	}
				190
				191	Regexp* Regexp::Star(Regexp* sub, ParseFlags flags) {
				192	if (sub->op() == kRegexpStar && sub->parse_flags() == flags)
				193	return sub;
				194	Regexp* re = new Regexp(kRegexpStar, flags);
				195	re->AllocSub(1);
				196	re->sub()[0] = sub;
				197	return re;
				198	}
				199
				200	Regexp* Regexp::Quest(Regexp* sub, ParseFlags flags) {
				201	if (sub->op() == kRegexpQuest && sub->parse_flags() == flags)
				202	return sub;
				203	Regexp* re = new Regexp(kRegexpQuest, flags);
				204	re->AllocSub(1);
				205	re->sub()[0] = sub;
				206	return re;
				207	}
				208
				209	Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub,
				210	ParseFlags flags, bool can_factor) {
				211	if (nsub == 1)
				212	return sub[0];
				213
				214	Regexp** subcopy = NULL;
				215	if (op == kRegexpAlternate && can_factor) {
				216	// Going to edit sub; make a copy so we don't step on caller.
				217	subcopy = new Regexp*[nsub];
				218	memmove(subcopy, sub, nsub * sizeof sub[0]);
				219	sub = subcopy;
				220	nsub = FactorAlternation(sub, nsub, flags);
				221	if (nsub == 1) {
				222	Regexp* re = sub[0];
				223	delete[] subcopy;
				224	return re;
				225	}
				226	}
				227
				228	if (nsub > kMaxNsub) {
				229	// Too many subexpressions to fit in a single Regexp.
				230	// Make a two-level tree. Two levels gets us to 65535^2.
				231	int nbigsub = (nsub+kMaxNsub-1)/kMaxNsub;
				232	Regexp* re = new Regexp(op, flags);
				233	re->AllocSub(nbigsub);
				234	Regexp** subs = re->sub();
				235	for (int i = 0; i < nbigsub - 1; i++)
				236	subs[i] = ConcatOrAlternate(op, sub+i*kMaxNsub, kMaxNsub, flags, false);
				237	subs[nbigsub - 1] = ConcatOrAlternate(op, sub+(nbigsub-1)*kMaxNsub,
				238	nsub - (nbigsub-1)*kMaxNsub, flags,
				239	false);
				240	delete[] subcopy;
				241	return re;
				242	}
				243
				244	Regexp* re = new Regexp(op, flags);
				245	re->AllocSub(nsub);
				246	Regexp** subs = re->sub();
				247	for (int i = 0; i < nsub; i++)
				248	subs[i] = sub[i];
				249
				250	delete[] subcopy;
				251	return re;
				252	}
				253
				254	Regexp* Regexp::Concat(Regexp** sub, int nsub, ParseFlags flags) {
				255	return ConcatOrAlternate(kRegexpConcat, sub, nsub, flags, false);
				256	}
				257
				258	Regexp* Regexp::Alternate(Regexp** sub, int nsub, ParseFlags flags) {
				259	return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, true);
				260	}
				261
				262	Regexp* Regexp::AlternateNoFactor(Regexp** sub, int nsub, ParseFlags flags) {
				263	return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, false);
				264	}
				265
				266	Regexp* Regexp::Capture(Regexp* sub, ParseFlags flags, int cap) {
				267	Regexp* re = new Regexp(kRegexpCapture, flags);
				268	re->AllocSub(1);
				269	re->sub()[0] = sub;
				270	re->cap_ = cap;
				271	return re;
				272	}
				273
				274	Regexp* Regexp::Repeat(Regexp* sub, ParseFlags flags, int min, int max) {
				275	Regexp* re = new Regexp(kRegexpRepeat, flags);
				276	re->AllocSub(1);
				277	re->sub()[0] = sub;
				278	re->min_ = min;
				279	re->max_ = max;
				280	return re;
				281	}
				282
				283	Regexp* Regexp::NewLiteral(Rune rune, ParseFlags flags) {
				284	Regexp* re = new Regexp(kRegexpLiteral, flags);
				285	re->rune_ = rune;
				286	return re;
				287	}
				288
				289	Regexp* Regexp::LiteralString(Rune* runes, int nrunes, ParseFlags flags) {
				290	if (nrunes <= 0)
				291	return new Regexp(kRegexpEmptyMatch, flags);
				292	if (nrunes == 1)
				293	return NewLiteral(runes[0], flags);
				294	Regexp* re = new Regexp(kRegexpLiteralString, flags);
				295	for (int i = 0; i < nrunes; i++)
				296	re->AddRuneToString(runes[i]);
				297	return re;
				298	}
				299
				300	Regexp* Regexp::NewCharClass(CharClass* cc, ParseFlags flags) {
				301	Regexp* re = new Regexp(kRegexpCharClass, flags);
				302	re->cc_ = cc;
				303	return re;
				304	}
				305
				306	// Swaps this and that in place.
				307	void Regexp::Swap(Regexp* that) {
				308	// Can use memmove because Regexp is just a struct (no vtable).
				309	char tmp[sizeof *this];
				310	memmove(tmp, this, sizeof tmp);
				311	memmove(this, that, sizeof tmp);
				312	memmove(that, tmp, sizeof tmp);
				313	}
				314
				315	// Tests equality of all top-level structure but not subregexps.
				316	static bool TopEqual(Regexp* a, Regexp* b) {
				317	if (a->op() != b->op())
				318	return false;
				319
				320	switch (a->op()) {
				321	case kRegexpNoMatch:
				322	case kRegexpEmptyMatch:
				323	case kRegexpAnyChar:
				324	case kRegexpAnyByte:
				325	case kRegexpBeginLine:
				326	case kRegexpEndLine:
				327	case kRegexpWordBoundary:
				328	case kRegexpNoWordBoundary:
				329	case kRegexpBeginText:
				330	return true;
				331
				332	case kRegexpEndText:
				333	// The parse flags remember whether it's \z or (?-m:$),
				334	// which matters when testing against PCRE.
				335	return ((a->parse_flags() ^ b->parse_flags()) & Regexp::WasDollar) == 0;
				336
				337	case kRegexpLiteral:
				338	return a->rune() == b->rune() &&
				339	((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0;
				340
				341	case kRegexpLiteralString:
				342	return a->nrunes() == b->nrunes() &&
				343	((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0 &&
				344	memcmp(a->runes(), b->runes(),
				345	a->nrunes() * sizeof a->runes()[0]) == 0;
				346
				347	case kRegexpAlternate:
				348	case kRegexpConcat:
				349	return a->nsub() == b->nsub();
				350
				351	case kRegexpStar:
				352	case kRegexpPlus:
				353	case kRegexpQuest:
				354	return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0;
				355
				356	case kRegexpRepeat:
				357	return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0 &&
				358	a->min() == b->min() &&
				359	a->max() == b->max();
				360
				361	case kRegexpCapture:
				362	return a->cap() == b->cap() && a->name() == b->name();
				363
				364	case kRegexpHaveMatch:
				365	return a->match_id() == b->match_id();
				366
				367	case kRegexpCharClass: {
				368	CharClass* acc = a->cc();
				369	CharClass* bcc = b->cc();
				370	return acc->size() == bcc->size() &&
				371	acc->end() - acc->begin() == bcc->end() - bcc->begin() &&
				372	memcmp(acc->begin(), bcc->begin(),
				373	(acc->end() - acc->begin()) * sizeof acc->begin()[0]) == 0;
				374	}
				375	}
				376
				377	LOG(DFATAL) << "Unexpected op in Regexp::Equal: " << a->op();
				378	return 0;
				379	}
				380
				381	bool Regexp::Equal(Regexp* a, Regexp* b) {
				382	if (a == NULL \|\| b == NULL)
				383	return a == b;
				384
				385	if (!TopEqual(a, b))
				386	return false;
				387
				388	// Fast path:
				389	// return without allocating vector if there are no subregexps.
				390	switch (a->op()) {
				391	case kRegexpAlternate:
				392	case kRegexpConcat:
				393	case kRegexpStar:
				394	case kRegexpPlus:
				395	case kRegexpQuest:
				396	case kRegexpRepeat:
				397	case kRegexpCapture:
				398	break;
				399
				400	default:
				401	return true;
				402	}
				403
				404	// Committed to doing real work.
				405	// The stack (vector) has pairs of regexps waiting to
				406	// be compared. The regexps are only equal if
				407	// all the pairs end up being equal.
				408	vector<Regexp*> stk;
				409
				410	for (;;) {
				411	// Invariant: TopEqual(a, b) == true.
				412	Regexp* a2;
				413	Regexp* b2;
				414	switch (a->op()) {
				415	default:
				416	break;
				417	case kRegexpAlternate:
				418	case kRegexpConcat:
				419	for (int i = 0; i < a->nsub(); i++) {
				420	a2 = a->sub()[i];
				421	b2 = b->sub()[i];
				422	if (!TopEqual(a2, b2))
				423	return false;
				424	stk.push_back(a2);
				425	stk.push_back(b2);
				426	}
				427	break;
				428
				429	case kRegexpStar:
				430	case kRegexpPlus:
				431	case kRegexpQuest:
				432	case kRegexpRepeat:
				433	case kRegexpCapture:
				434	a2 = a->sub()[0];
				435	b2 = b->sub()[0];
				436	if (!TopEqual(a2, b2))
				437	return false;
				438	// Really:
				439	// stk.push_back(a2);
				440	// stk.push_back(b2);
				441	// break;
				442	// but faster to assign directly and loop.
				443	a = a2;
				444	b = b2;
				445	continue;
				446	}
				447
				448	int n = stk.size();
				449	if (n == 0)
				450	break;
				451
				452	a = stk[n-2];
				453	b = stk[n-1];
				454	stk.resize(n-2);
				455	}
				456
				457	return true;
				458	}
				459
				460	// Keep in sync with enum RegexpStatusCode in regexp.h
Alexander Gutkin	0d4c523	2013-02-28 13:47:27 +0000	[diff] [blame]	461	static const char *kErrorStrings[] = {
Ian Hodson	2ee91b4	2012-05-14 12:29:36 +0100	[diff] [blame]	462	"no error",
				463	"unexpected error",
				464	"invalid escape sequence",
				465	"invalid character class",
				466	"invalid character class range",
				467	"missing ]",
				468	"missing )",
				469	"trailing \\",
				470	"no argument for repetition operator",
				471	"invalid repetition size",
				472	"bad repetition operator",
				473	"invalid perl operator",
				474	"invalid UTF-8",
				475	"invalid named capture group",
				476	};
				477
Alexander Gutkin	0d4c523	2013-02-28 13:47:27 +0000	[diff] [blame]	478	string RegexpStatus::CodeText(enum RegexpStatusCode code) {
Ian Hodson	2ee91b4	2012-05-14 12:29:36 +0100	[diff] [blame]	479	if (code < 0 \|\| code >= arraysize(kErrorStrings))
				480	code = kRegexpInternalError;
				481	return kErrorStrings[code];
				482	}
				483
				484	string RegexpStatus::Text() const {
				485	if (error_arg_.empty())
				486	return CodeText(code_);
				487	string s;
				488	s.append(CodeText(code_));
				489	s.append(": ");
				490	s.append(error_arg_.data(), error_arg_.size());
				491	return s;
				492	}
				493
				494	void RegexpStatus::Copy(const RegexpStatus& status) {
				495	code_ = status.code_;
				496	error_arg_ = status.error_arg_;
				497	}
				498
				499	typedef int Ignored; // Walker<void> doesn't exist
				500
				501	// Walker subclass to count capturing parens in regexp.
				502	class NumCapturesWalker : public Regexp::Walker<Ignored> {
				503	public:
				504	NumCapturesWalker() : ncapture_(0) {}
				505	int ncapture() { return ncapture_; }
				506
				507	virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
				508	if (re->op() == kRegexpCapture)
				509	ncapture_++;
				510	return ignored;
				511	}
				512	virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
				513	// Should never be called: we use Walk not WalkExponential.
				514	LOG(DFATAL) << "NumCapturesWalker::ShortVisit called";
				515	return ignored;
				516	}
				517
				518	private:
				519	int ncapture_;
				520	DISALLOW_EVIL_CONSTRUCTORS(NumCapturesWalker);
				521	};
				522
				523	int Regexp::NumCaptures() {
				524	NumCapturesWalker w;
				525	w.Walk(this, 0);
				526	return w.ncapture();
				527	}
				528
				529	// Walker class to build map of named capture groups and their indices.
				530	class NamedCapturesWalker : public Regexp::Walker<Ignored> {
				531	public:
				532	NamedCapturesWalker() : map_(NULL) {}
				533	~NamedCapturesWalker() { delete map_; }
				534
				535	map<string, int>* TakeMap() {
				536	map<string, int>* m = map_;
				537	map_ = NULL;
				538	return m;
				539	}
				540
				541	Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
				542	if (re->op() == kRegexpCapture && re->name() != NULL) {
				543	// Allocate map once we find a name.
				544	if (map_ == NULL)
				545	map_ = new map<string, int>;
				546
				547	// Record first occurrence of each name.
				548	// (The rule is that if you have the same name
				549	// multiple times, only the leftmost one counts.)
				550	if (map_->find(*re->name()) == map_->end())
				551	(map_)[re->name()] = re->cap();
				552	}
				553	return ignored;
				554	}
				555
				556	virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
				557	// Should never be called: we use Walk not WalkExponential.
				558	LOG(DFATAL) << "NamedCapturesWalker::ShortVisit called";
				559	return ignored;
				560	}
				561
				562	private:
				563	map<string, int>* map_;
				564	DISALLOW_EVIL_CONSTRUCTORS(NamedCapturesWalker);
				565	};
				566
				567	map<string, int>* Regexp::NamedCaptures() {
				568	NamedCapturesWalker w;
				569	w.Walk(this, 0);
				570	return w.TakeMap();
				571	}
				572
				573	// Walker class to build map from capture group indices to their names.
				574	class CaptureNamesWalker : public Regexp::Walker<Ignored> {
				575	public:
				576	CaptureNamesWalker() : map_(NULL) {}
				577	~CaptureNamesWalker() { delete map_; }
				578
				579	map<int, string>* TakeMap() {
				580	map<int, string>* m = map_;
				581	map_ = NULL;
				582	return m;
				583	}
				584
				585	Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
				586	if (re->op() == kRegexpCapture && re->name() != NULL) {
				587	// Allocate map once we find a name.
				588	if (map_ == NULL)
				589	map_ = new map<int, string>;
				590
				591	(map_)[re->cap()] = re->name();
				592	}
				593	return ignored;
				594	}
				595
				596	virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
				597	// Should never be called: we use Walk not WalkExponential.
				598	LOG(DFATAL) << "CaptureNamesWalker::ShortVisit called";
				599	return ignored;
				600	}
				601
				602	private:
				603	map<int, string>* map_;
				604	DISALLOW_EVIL_CONSTRUCTORS(CaptureNamesWalker);
				605	};
				606
				607	map<int, string>* Regexp::CaptureNames() {
				608	CaptureNamesWalker w;
				609	w.Walk(this, 0);
				610	return w.TakeMap();
				611	}
				612
				613	// Determines whether regexp matches must be anchored
				614	// with a fixed string prefix. If so, returns the prefix and
				615	// the regexp that remains after the prefix. The prefix might
				616	// be ASCII case-insensitive.
				617	bool Regexp::RequiredPrefix(string prefix, bool foldcase, Regexp** suffix) {
				618	// No need for a walker: the regexp must be of the form
				619	// 1. some number of ^ anchors
				620	// 2. a literal char or string
				621	// 3. the rest
				622	prefix->clear();
				623	*foldcase = false;
				624	*suffix = NULL;
				625	if (op_ != kRegexpConcat)
				626	return false;
				627
				628	// Some number of anchors, then a literal or concatenation.
				629	int i = 0;
				630	Regexp** sub = this->sub();
				631	while (i < nsub_ && sub[i]->op_ == kRegexpBeginText)
				632	i++;
				633	if (i == 0 \|\| i >= nsub_)
				634	return false;
				635
				636	Regexp* re = sub[i];
				637	switch (re->op_) {
				638	default:
				639	return false;
				640
				641	case kRegexpLiteralString:
				642	// Convert to string in proper encoding.
				643	if (re->parse_flags() & Latin1) {
				644	prefix->resize(re->nrunes_);
				645	for (int j = 0; j < re->nrunes_; j++)
				646	(*prefix)[j] = re->runes_[j];
				647	} else {
				648	// Convert to UTF-8 in place.
				649	// Assume worst-case space and then trim.
				650	prefix->resize(re->nrunes_ * UTFmax);
				651	char p = &(prefix)[0];
				652	for (int j = 0; j < re->nrunes_; j++) {
				653	Rune r = re->runes_[j];
				654	if (r < Runeself)
				655	*p++ = r;
				656	else
				657	p += runetochar(p, &r);
				658	}
				659	prefix->resize(p - &(*prefix)[0]);
				660	}
				661	break;
				662
				663	case kRegexpLiteral:
				664	if ((re->parse_flags() & Latin1) \|\| re->rune_ < Runeself) {
				665	prefix->append(1, re->rune_);
				666	} else {
				667	char buf[UTFmax];
				668	prefix->append(buf, runetochar(buf, &re->rune_));
				669	}
				670	break;
				671	}
				672	*foldcase = (sub[i]->parse_flags() & FoldCase);
				673	i++;
				674
				675	// The rest.
				676	if (i < nsub_) {
				677	for (int j = i; j < nsub_; j++)
				678	sub[j]->Incref();
				679	re = Concat(sub + i, nsub_ - i, parse_flags());
				680	} else {
				681	re = new Regexp(kRegexpEmptyMatch, parse_flags());
				682	}
				683	*suffix = re;
				684	return true;
				685	}
				686
				687	// Character class builder is a balanced binary tree (STL set)
				688	// containing non-overlapping, non-abutting RuneRanges.
				689	// The less-than operator used in the tree treats two
				690	// ranges as equal if they overlap at all, so that
				691	// lookups for a particular Rune are possible.
				692
				693	CharClassBuilder::CharClassBuilder() {
				694	nrunes_ = 0;
				695	upper_ = 0;
				696	lower_ = 0;
				697	}
				698
				699	// Add lo-hi to the class; return whether class got bigger.
				700	bool CharClassBuilder::AddRange(Rune lo, Rune hi) {
				701	if (hi < lo)
				702	return false;
				703
				704	if (lo <= 'z' && hi >= 'A') {
				705	// Overlaps some alpha, maybe not all.
				706	// Update bitmaps telling which ASCII letters are in the set.
				707	Rune lo1 = max<Rune>(lo, 'A');
				708	Rune hi1 = min<Rune>(hi, 'Z');
				709	if (lo1 <= hi1)
				710	upper_ \|= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'A');
				711
				712	lo1 = max<Rune>(lo, 'a');
				713	hi1 = min<Rune>(hi, 'z');
				714	if (lo1 <= hi1)
				715	lower_ \|= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'a');
				716	}
				717
				718	{ // Check whether lo, hi is already in the class.
				719	iterator it = ranges_.find(RuneRange(lo, lo));
				720	if (it != end() && it->lo <= lo && hi <= it->hi)
				721	return false;
				722	}
				723
				724	// Look for a range abutting lo on the left.
				725	// If it exists, take it out and increase our range.
				726	if (lo > 0) {
				727	iterator it = ranges_.find(RuneRange(lo-1, lo-1));
				728	if (it != end()) {
				729	lo = it->lo;
				730	if (it->hi > hi)
				731	hi = it->hi;
				732	nrunes_ -= it->hi - it->lo + 1;
				733	ranges_.erase(it);
				734	}
				735	}
				736
				737	// Look for a range abutting hi on the right.
				738	// If it exists, take it out and increase our range.
				739	if (hi < Runemax) {
				740	iterator it = ranges_.find(RuneRange(hi+1, hi+1));
				741	if (it != end()) {
				742	hi = it->hi;
				743	nrunes_ -= it->hi - it->lo + 1;
				744	ranges_.erase(it);
				745	}
				746	}
				747
				748	// Look for ranges between lo and hi. Take them out.
				749	// This is only safe because the set has no overlapping ranges.
				750	// We've already removed any ranges abutting lo and hi, so
				751	// any that overlap [lo, hi] must be contained within it.
				752	for (;;) {
				753	iterator it = ranges_.find(RuneRange(lo, hi));
				754	if (it == end())
				755	break;
				756	nrunes_ -= it->hi - it->lo + 1;
				757	ranges_.erase(it);
				758	}
				759
				760	// Finally, add [lo, hi].
				761	nrunes_ += hi - lo + 1;
				762	ranges_.insert(RuneRange(lo, hi));
				763	return true;
				764	}
				765
				766	void CharClassBuilder::AddCharClass(CharClassBuilder *cc) {
				767	for (iterator it = cc->begin(); it != cc->end(); ++it)
				768	AddRange(it->lo, it->hi);
				769	}
				770
				771	bool CharClassBuilder::Contains(Rune r) {
				772	return ranges_.find(RuneRange(r, r)) != end();
				773	}
				774
				775	// Does the character class behave the same on A-Z as on a-z?
				776	bool CharClassBuilder::FoldsASCII() {
				777	return ((upper_ ^ lower_) & AlphaMask) == 0;
				778	}
				779
				780	CharClassBuilder* CharClassBuilder::Copy() {
				781	CharClassBuilder* cc = new CharClassBuilder;
				782	for (iterator it = begin(); it != end(); ++it)
				783	cc->ranges_.insert(RuneRange(it->lo, it->hi));
				784	cc->upper_ = upper_;
				785	cc->lower_ = lower_;
				786	cc->nrunes_ = nrunes_;
				787	return cc;
				788	}
				789
				790
				791
				792	void CharClassBuilder::RemoveAbove(Rune r) {
				793	if (r >= Runemax)
				794	return;
				795
				796	if (r < 'z') {
				797	if (r < 'a')
				798	lower_ = 0;
				799	else
				800	lower_ &= AlphaMask >> ('z' - r);
				801	}
				802
				803	if (r < 'Z') {
				804	if (r < 'A')
				805	upper_ = 0;
				806	else
				807	upper_ &= AlphaMask >> ('Z' - r);
				808	}
				809
				810	for (;;) {
				811
				812	iterator it = ranges_.find(RuneRange(r + 1, Runemax));
				813	if (it == end())
				814	break;
				815	RuneRange rr = *it;
				816	ranges_.erase(it);
				817	nrunes_ -= rr.hi - rr.lo + 1;
				818	if (rr.lo <= r) {
				819	rr.hi = r;
				820	ranges_.insert(rr);
				821	nrunes_ += rr.hi - rr.lo + 1;
				822	}
				823	}
				824	}
				825
				826	void CharClassBuilder::Negate() {
				827	// Build up negation and then copy in.
				828	// Could edit ranges in place, but C++ won't let me.
				829	vector<RuneRange> v;
				830	v.reserve(ranges_.size() + 1);
				831
				832	// In negation, first range begins at 0, unless
				833	// the current class begins at 0.
				834	iterator it = begin();
				835	if (it == end()) {
				836	v.push_back(RuneRange(0, Runemax));
				837	} else {
				838	int nextlo = 0;
				839	if (it->lo == 0) {
				840	nextlo = it->hi + 1;
				841	++it;
				842	}
				843	for (; it != end(); ++it) {
				844	v.push_back(RuneRange(nextlo, it->lo - 1));
				845	nextlo = it->hi + 1;
				846	}
				847	if (nextlo <= Runemax)
				848	v.push_back(RuneRange(nextlo, Runemax));
				849	}
				850
				851	ranges_.clear();
				852	for (int i = 0; i < v.size(); i++)
				853	ranges_.insert(v[i]);
				854
				855	upper_ = AlphaMask & ~upper_;
				856	lower_ = AlphaMask & ~lower_;
				857	nrunes_ = Runemax+1 - nrunes_;
				858	}
				859
				860	// Character class is a sorted list of ranges.
				861	// The ranges are allocated in the same block as the header,
				862	// necessitating a special allocator and Delete method.
				863
				864	CharClass* CharClass::New(int maxranges) {
				865	CharClass* cc;
				866	uint8* data = new uint8[sizeof cc + maxrangessizeof cc->ranges_[0]];
				867	cc = reinterpret_cast<CharClass*>(data);
				868	cc->ranges_ = reinterpret_cast<RuneRange>(data + sizeof cc);
				869	cc->nranges_ = 0;
				870	cc->folds_ascii_ = false;
				871	cc->nrunes_ = 0;
				872	return cc;
				873	}
				874
				875	void CharClass::Delete() {
				876	if (this == NULL)
				877	return;
				878	uint8 data = reinterpret_cast<uint8>(this);
				879	delete[] data;
				880	}
				881
				882	CharClass* CharClass::Negate() {
				883	CharClass* cc = CharClass::New(nranges_+1);
				884	cc->folds_ascii_ = folds_ascii_;
				885	cc->nrunes_ = Runemax + 1 - nrunes_;
				886	int n = 0;
				887	int nextlo = 0;
				888	for (CharClass::iterator it = begin(); it != end(); ++it) {
				889	if (it->lo == nextlo) {
				890	nextlo = it->hi + 1;
				891	} else {
				892	cc->ranges_[n++] = RuneRange(nextlo, it->lo - 1);
				893	nextlo = it->hi + 1;
				894	}
				895	}
				896	if (nextlo <= Runemax)
				897	cc->ranges_[n++] = RuneRange(nextlo, Runemax);
				898	cc->nranges_ = n;
				899	return cc;
				900	}
				901
				902	bool CharClass::Contains(Rune r) {
				903	RuneRange* rr = ranges_;
				904	int n = nranges_;
				905	while (n > 0) {
				906	int m = n/2;
				907	if (rr[m].hi < r) {
				908	rr += m+1;
				909	n -= m+1;
				910	} else if (r < rr[m].lo) {
				911	n = m;
				912	} else { // rr[m].lo <= r && r <= rr[m].hi
				913	return true;
				914	}
				915	}
				916	return false;
				917	}
				918
				919	CharClass* CharClassBuilder::GetCharClass() {
				920	CharClass* cc = CharClass::New(ranges_.size());
				921	int n = 0;
				922	for (iterator it = begin(); it != end(); ++it)
				923	cc->ranges_[n++] = *it;
				924	cc->nranges_ = n;
				925	DCHECK_LE(n, ranges_.size());
				926	cc->nrunes_ = nrunes_;
				927	cc->folds_ascii_ = FoldsASCII();
				928	return cc;
				929	}
				930
				931	} // namespace re2