Blame - re2/testing/tester.cc - fp2-dev/platform/external/regex-re2

blob: 003dc5add05696f9dfcdb88936aa1230831a265e [file] [log] [blame]

Alexander Gutkin	0d4c523	2013-02-28 13:47:27 +0000	[diff] [blame]	1	// Copyright 2008 The RE2 Authors. All Rights Reserved.
				2	// Use of this source code is governed by a BSD-style
				3	// license that can be found in the LICENSE file.
				4
				5	// Regular expression engine tester -- test all the implementations against each other.
				6
				7	#include "util/util.h"
				8	#include "util/flags.h"
				9	#include "re2/testing/tester.h"
				10	#include "re2/prog.h"
				11	#include "re2/re2.h"
				12	#include "re2/regexp.h"
				13
				14	DEFINE_bool(dump_prog, false, "dump regexp program");
				15	DEFINE_bool(log_okay, false, "log successful runs");
				16	DEFINE_bool(dump_rprog, false, "dump reversed regexp program");
				17
				18	DEFINE_int32(max_regexp_failures, 100,
				19	"maximum number of regexp test failures (-1 = unlimited)");
				20
				21	DEFINE_string(regexp_engines, "", "pattern to select regexp engines to test");
				22
				23	namespace re2 {
				24
				25	enum {
				26	kMaxSubmatch = 1+16, // $0...$16
				27	};
				28
				29	const char* engine_types[kEngineMax] = {
				30	"Backtrack",
				31	"NFA",
				32	"DFA",
				33	"DFA1",
				34	"OnePass",
				35	"BitState",
				36	"RE2",
				37	"RE2a",
				38	"RE2b",
				39	"PCRE",
				40	};
				41
				42	// Returns the name string for the type t.
				43	static string EngineString(Engine t) {
				44	if (t < 0 \|\| t >= arraysize(engine_types) \|\| engine_types[t] == NULL) {
				45	return StringPrintf("type%d", static_cast<int>(t));
				46	}
				47	return engine_types[t];
				48	}
				49
				50	// Returns bit mask of engines to use.
				51	static uint32 Engines() {
				52	static uint32 cached_engines;
				53	static bool did_parse;
				54
				55	if (did_parse)
				56	return cached_engines;
				57
				58	if (FLAGS_regexp_engines.empty()) {
				59	cached_engines = ~0;
				60	} else {
				61	for (Engine i = static_cast<Engine>(0); i < kEngineMax; i++)
				62	if (strstr(EngineString(i).c_str(), FLAGS_regexp_engines.c_str()))
				63	cached_engines \|= 1<<i;
				64	}
				65
				66	if (cached_engines == 0)
				67	LOG(INFO) << "Warning: no engines enabled.";
				68	if (!UsingPCRE)
				69	cached_engines &= ~(1<<kEnginePCRE);
				70	for (Engine i = static_cast<Engine>(0); i < kEngineMax; i++) {
				71	if (cached_engines & (1<<i))
				72	LOG(INFO) << EngineString(i) << " enabled";
				73	}
				74	did_parse = true;
				75	return cached_engines;
				76	}
				77
				78	// The result of running a match.
				79	struct TestInstance::Result {
				80	bool skipped; // test skipped: wasn't applicable
				81	bool matched; // found a match
				82	bool untrusted; // don't really trust the answer
				83	bool have_submatch; // computed all submatch info
				84	bool have_submatch0; // computed just submatch[0]
				85	StringPiece submatch[kMaxSubmatch];
				86	};
				87
				88	typedef TestInstance::Result Result;
				89
				90	// Formats a single capture range s in text in the form (a,b)
				91	// where a and b are the starting and ending offsets of s in text.
				92	static string FormatCapture(const StringPiece& text, const StringPiece& s) {
				93	if (s.begin() == NULL)
				94	return "(?,?)";
				95	return StringPrintf("(%d,%d)",
				96	static_cast<int>(s.begin() - text.begin()),
				97	static_cast<int>(s.end() - text.begin()));
				98	}
				99
				100	// Returns whether text contains non-ASCII (>= 0x80) bytes.
				101	static bool NonASCII(const StringPiece& text) {
				102	for (int i = 0; i < text.size(); i++)
				103	if ((uint8)text[i] >= 0x80)
				104	return true;
				105	return false;
				106	}
				107
				108	// Returns string representation of match kind.
				109	static string FormatKind(Prog::MatchKind kind) {
				110	switch (kind) {
				111	case Prog::kFullMatch:
				112	return "full match";
				113	case Prog::kLongestMatch:
				114	return "longest match";
				115	case Prog::kFirstMatch:
				116	return "first match";
				117	case Prog::kManyMatch:
				118	return "many match";
				119	}
				120	return "???";
				121	}
				122
				123	// Returns string representation of anchor kind.
				124	static string FormatAnchor(Prog::Anchor anchor) {
				125	switch (anchor) {
				126	case Prog::kAnchored:
				127	return "anchored";
				128	case Prog::kUnanchored:
				129	return "unanchored";
				130	}
				131	return "???";
				132	}
				133
				134	struct ParseMode {
				135	Regexp::ParseFlags parse_flags;
				136	string desc;
				137	};
				138
				139	static const Regexp::ParseFlags single_line =
				140	Regexp::LikePerl;
				141	static const Regexp::ParseFlags multi_line =
				142	static_cast<Regexp::ParseFlags>(Regexp::LikePerl & ~Regexp::OneLine);
				143
				144	static ParseMode parse_modes[] = {
				145	{ single_line, "single-line" },
				146	{ single_line\|Regexp::Latin1, "single-line, latin1" },
				147	{ multi_line, "multiline" },
				148	{ multi_line\|Regexp::NonGreedy, "multiline, nongreedy" },
				149	{ multi_line\|Regexp::Latin1, "multiline, latin1" },
				150	};
				151
				152	static string FormatMode(Regexp::ParseFlags flags) {
				153	for (int i = 0; i < arraysize(parse_modes); i++)
				154	if (parse_modes[i].parse_flags == flags)
				155	return parse_modes[i].desc;
				156	return StringPrintf("%#x", static_cast<uint>(flags));
				157	}
				158
				159	// Constructs and saves all the matching engines that
				160	// will be required for the given tests.
				161	TestInstance::TestInstance(const StringPiece& regexp_str, Prog::MatchKind kind,
				162	Regexp::ParseFlags flags)
				163	: regexp_str_(regexp_str),
				164	kind_(kind),
				165	flags_(flags),
				166	error_(false),
				167	regexp_(NULL),
				168	num_captures_(0),
				169	prog_(NULL),
				170	rprog_(NULL),
				171	re_(NULL),
				172	re2_(NULL) {
				173
				174	VLOG(1) << CEscape(regexp_str);
				175
				176	// Compile regexp to prog.
				177	// Always required - needed for backtracking (reference implementation).
				178	RegexpStatus status;
				179	regexp_ = Regexp::Parse(regexp_str, flags, &status);
				180	if (regexp_ == NULL) {
				181	LOG(INFO) << "Cannot parse: " << CEscape(regexp_str_)
				182	<< " mode: " << FormatMode(flags);
				183	error_ = true;
				184	return;
				185	}
				186	num_captures_ = regexp_->NumCaptures();
				187	prog_ = regexp_->CompileToProg(0);
				188	if (prog_ == NULL) {
				189	LOG(INFO) << "Cannot compile: " << CEscape(regexp_str_);
				190	error_ = true;
				191	return;
				192	}
				193	if (FLAGS_dump_prog) {
				194	LOG(INFO) << "Prog for "
				195	<< " regexp "
				196	<< CEscape(regexp_str_)
				197	<< " (" << FormatKind(kind_)
				198	<< ", " << FormatMode(flags_)
				199	<< ")\n"
				200	<< prog_->Dump();
				201	}
				202
				203	// Compile regexp to reversed prog. Only needed for DFA engines.
				204	if (Engines() & ((1<<kEngineDFA)\|(1<<kEngineDFA1))) {
				205	rprog_ = regexp_->CompileToReverseProg(0);
				206	if (rprog_ == NULL) {
				207	LOG(INFO) << "Cannot reverse compile: " << CEscape(regexp_str_);
				208	error_ = true;
				209	return;
				210	}
				211	if (FLAGS_dump_rprog)
				212	LOG(INFO) << rprog_->Dump();
				213	}
				214
				215	// Create re string that will be used for RE and RE2.
				216	string re = regexp_str.as_string();
				217	// Accomodate flags.
				218	// Regexp::Latin1 will be accomodated below.
				219	if (!(flags & Regexp::OneLine))
				220	re = "(?m)" + re;
				221	if (flags & Regexp::NonGreedy)
				222	re = "(?U)" + re;
				223	if (flags & Regexp::DotNL)
				224	re = "(?s)" + re;
				225
				226	// Compile regexp to RE2.
				227	if (Engines() & ((1<<kEngineRE2)\|(1<<kEngineRE2a)\|(1<<kEngineRE2b))) {
				228	RE2::Options options;
				229	if (flags & Regexp::Latin1)
				230	options.set_encoding(RE2::Options::EncodingLatin1);
				231	if (kind_ == Prog::kLongestMatch)
				232	options.set_longest_match(true);
				233	re2_ = new RE2(re, options);
				234	if (!re2_->error().empty()) {
				235	LOG(INFO) << "Cannot RE2: " << CEscape(re);
				236	error_ = true;
				237	return;
				238	}
				239	}
				240
				241	// Compile regexp to RE.
				242	// PCRE as exposed by the RE interface isn't always usable.
				243	// 1. It disagrees about handling of empty-string reptitions
				244	// like matching (a) against "b". PCRE treats the (a*) as
				245	// occurring once, while we treat it as occurring not at all.
				246	// 2. It treats $ as this weird thing meaning end of string
				247	// or before the \n at the end of the string.
				248	// 3. It doesn't implement POSIX leftmost-longest matching.
				249	// MimicsPCRE() detects 1 and 2.
				250	if ((Engines() & (1<<kEnginePCRE)) && regexp_->MimicsPCRE() &&
				251	kind_ != Prog::kLongestMatch) {
				252	PCRE_Options o;
				253	o.set_option(PCRE::UTF8);
				254	if (flags & Regexp::Latin1)
				255	o.set_option(PCRE::None);
				256	// PCRE has interface bug keeping us from finding $0, so
				257	// add one more layer of parens.
				258	re_ = new PCRE("("+re+")", o);
				259	if (!re_->error().empty()) {
				260	LOG(INFO) << "Cannot PCRE: " << CEscape(re);
				261	error_ = true;
				262	return;
				263	}
				264	}
				265	}
				266
				267	TestInstance::~TestInstance() {
				268	if (regexp_)
				269	regexp_->Decref();
				270	delete prog_;
				271	delete rprog_;
				272	delete re_;
				273	delete re2_;
				274	}
				275
				276	// Runs a single search using the named engine type.
				277	// This interface hides all the irregularities of the various
				278	// engine interfaces from the rest of this file.
				279	void TestInstance::RunSearch(Engine type,
				280	const StringPiece& orig_text,
				281	const StringPiece& orig_context,
				282	Prog::Anchor anchor,
				283	Result *result) {
				284	memset(result, 0, sizeof *result);
				285	if (regexp_ == NULL) {
				286	result->skipped = true;
				287	return;
				288	}
				289	int nsubmatch = 1 + num_captures_; // NumCaptures doesn't count $0
				290	if (nsubmatch > kMaxSubmatch)
				291	nsubmatch = kMaxSubmatch;
				292
				293	StringPiece text = orig_text;
				294	StringPiece context = orig_context;
				295
				296	switch (type) {
				297	default:
				298	LOG(FATAL) << "Bad RunSearch type: " << (int)type;
				299
				300	case kEngineBacktrack:
				301	if (prog_ == NULL) {
				302	result->skipped = true;
				303	break;
				304	}
				305	result->matched =
				306	prog_->UnsafeSearchBacktrack(text, context, anchor, kind_,
				307	result->submatch, nsubmatch);
				308	result->have_submatch = true;
				309	break;
				310
				311	case kEngineNFA:
				312	if (prog_ == NULL) {
				313	result->skipped = true;
				314	break;
				315	}
				316	result->matched =
				317	prog_->SearchNFA(text, context, anchor, kind_,
				318	result->submatch, nsubmatch);
				319	result->have_submatch = true;
				320	break;
				321
				322	case kEngineDFA:
				323	if (prog_ == NULL) {
				324	result->skipped = true;
				325	break;
				326	}
				327	result->matched = prog_->SearchDFA(text, context, anchor, kind_, NULL,
				328	&result->skipped, NULL);
				329	break;
				330
				331	case kEngineDFA1:
				332	if (prog_ == NULL \|\| rprog_ == NULL) {
				333	result->skipped = true;
				334	break;
				335	}
				336	result->matched =
				337	prog_->SearchDFA(text, context, anchor, kind_, result->submatch,
				338	&result->skipped, NULL);
				339	// If anchored, no need for second run,
				340	// but do it anyway to find more bugs.
				341	if (result->matched) {
				342	if (!rprog_->SearchDFA(result->submatch[0], context,
				343	Prog::kAnchored, Prog::kLongestMatch,
				344	result->submatch,
				345	&result->skipped, NULL)) {
				346	LOG(ERROR) << "Reverse DFA inconsistency: " << CEscape(regexp_str_)
				347	<< " on " << CEscape(text);
				348	result->matched = false;
				349	}
				350	}
				351	result->have_submatch0 = true;
				352	break;
				353
				354	case kEngineOnePass:
				355	if (prog_ == NULL \|\|
				356	anchor == Prog::kUnanchored \|\|
				357	!prog_->IsOnePass() \|\|
				358	nsubmatch > Prog::kMaxOnePassCapture) {
				359	result->skipped = true;
				360	break;
				361	}
				362	result->matched = prog_->SearchOnePass(text, context, anchor, kind_,
				363	result->submatch, nsubmatch);
				364	result->have_submatch = true;
				365	break;
				366
				367	case kEngineBitState:
				368	if (prog_ == NULL) {
				369	result->skipped = true;
				370	break;
				371	}
				372	result->matched = prog_->SearchBitState(text, context, anchor, kind_,
				373	result->submatch, nsubmatch);
				374	result->have_submatch = true;
				375	break;
				376
				377	case kEngineRE2:
				378	case kEngineRE2a:
				379	case kEngineRE2b: {
				380	if (!re2_ \|\| text.end() != context.end()) {
				381	result->skipped = true;
				382	break;
				383	}
				384
				385	RE2::Anchor re_anchor;
				386	if (anchor == Prog::kAnchored)
				387	re_anchor = RE2::ANCHOR_START;
				388	else
				389	re_anchor = RE2::UNANCHORED;
				390	if (kind_ == Prog::kFullMatch)
				391	re_anchor = RE2::ANCHOR_BOTH;
				392
				393	result->matched = re2_->Match(context,
				394	text.begin() - context.begin(),
				395	text.end() - context.begin(),
				396	re_anchor, result->submatch, nsubmatch);
				397	result->have_submatch = nsubmatch > 0;
				398	break;
				399	}
				400
				401	case kEnginePCRE: {
				402	if (!re_ \|\| text.begin() != context.begin() \|\|
				403	text.end() != context.end()) {
				404	result->skipped = true;
				405	break;
				406	}
				407
				408	const PCRE::Arg *argptr = new const PCRE::Arg[nsubmatch];
				409	PCRE::Arg *a = new PCRE::Arg[nsubmatch];
				410	for (int i = 0; i < nsubmatch; i++) {
				411	a[i] = PCRE::Arg(&result->submatch[i]);
				412	argptr[i] = &a[i];
				413	}
				414	int consumed;
				415	PCRE::Anchor pcre_anchor;
				416	if (anchor == Prog::kAnchored)
				417	pcre_anchor = PCRE::ANCHOR_START;
				418	else
				419	pcre_anchor = PCRE::UNANCHORED;
				420	if (kind_ == Prog::kFullMatch)
				421	pcre_anchor = PCRE::ANCHOR_BOTH;
				422	re_->ClearHitLimit();
				423	result->matched =
				424	re_->DoMatch(text,
				425	pcre_anchor,
				426	&consumed,
				427	argptr, nsubmatch);
				428	if (re_->HitLimit()) {
				429	result->untrusted = true;
				430	delete[] argptr;
				431	delete[] a;
				432	break;
				433	}
				434	result->have_submatch = true;
				435
				436	// Work around RE interface bug: PCRE returns -1 as the
				437	// offsets for an unmatched subexpression, and RE should
				438	// turn that into StringPiece(NULL) but in fact it uses
				439	// StringPiece(text.begin() - 1, 0). Oops.
				440	for (int i = 0; i < nsubmatch; i++)
				441	if (result->submatch[i].begin() == text.begin() - 1)
				442	result->submatch[i] = NULL;
				443	delete[] argptr;
				444	delete[] a;
				445	break;
				446	}
				447	}
				448
				449	if (!result->matched)
				450	memset(result->submatch, 0, sizeof result->submatch);
				451	}
				452
				453	// Checks whether r is okay given that correct is the right answer.
				454	// Specifically, r's answers have to match (but it doesn't have to
				455	// claim to have all the answers).
				456	static bool ResultOkay(const Result& r, const Result& correct) {
				457	if (r.skipped)
				458	return true;
				459	if (r.matched != correct.matched)
				460	return false;
				461	if (r.have_submatch \|\| r.have_submatch0) {
				462	for (int i = 0; i < kMaxSubmatch; i++) {
				463	if (correct.submatch[i].begin() != r.submatch[i].begin() \|\|
				464	correct.submatch[i].size() != r.submatch[i].size())
				465	return false;
				466	if (!r.have_submatch)
				467	break;
				468	}
				469	}
				470	return true;
				471	}
				472
				473	// Runs a single test.
				474	bool TestInstance::RunCase(const StringPiece& text, const StringPiece& context,
				475	Prog::Anchor anchor) {
				476	// Backtracking is the gold standard.
				477	Result correct;
				478	RunSearch(kEngineBacktrack, text, context, anchor, &correct);
				479	if (correct.skipped) {
				480	if (regexp_ == NULL)
				481	return true;
				482	LOG(ERROR) << "Skipped backtracking! " << CEscape(regexp_str_)
				483	<< " " << FormatMode(flags_);
				484	return false;
				485	}
				486	VLOG(1) << "Try: regexp " << CEscape(regexp_str_)
				487	<< " text " << CEscape(text)
				488	<< " (" << FormatKind(kind_)
				489	<< ", " << FormatAnchor(anchor)
				490	<< ", " << FormatMode(flags_)
				491	<< ")";
				492
				493	// Compare the others.
				494	bool all_okay = true;
				495	for (Engine i = kEngineBacktrack+1; i < kEngineMax; i++) {
				496	if (!(Engines() & (1<<i)))
				497	continue;
				498
				499	Result r;
				500	RunSearch(i, text, context, anchor, &r);
				501	if (ResultOkay(r, correct)) {
				502	if (FLAGS_log_okay)
				503	LogMatch(r.skipped ? "Skipped: " : "Okay: ", i, text, context, anchor);
				504	continue;
				505	}
				506
				507	// We disagree with PCRE on the meaning of some Unicode matches.
				508	// In particular, we treat all non-ASCII UTF-8 as word characters.
				509	// We also treat "empty" character sets like [^\w\W] as being
				510	// impossible to match, while PCRE apparently excludes some code
				511	// points (e.g., 0x0080) from both \w and \W.
				512	if (i == kEnginePCRE && NonASCII(text))
				513	continue;
				514
				515	if (!r.untrusted)
				516	all_okay = false;
				517
				518	LogMatch(r.untrusted ? "(Untrusted) Mismatch: " : "Mismatch: ", i, text,
				519	context, anchor);
				520	if (r.matched != correct.matched) {
				521	if (r.matched) {
				522	LOG(INFO) << " Should not match (but does).";
				523	} else {
				524	LOG(INFO) << " Should match (but does not).";
				525	continue;
				526	}
				527	}
				528	for (int i = 0; i < 1+num_captures_; i++) {
				529	if (r.submatch[i].begin() != correct.submatch[i].begin() \|\|
				530	r.submatch[i].end() != correct.submatch[i].end()) {
				531	LOG(INFO) <<
				532	StringPrintf(" $%d: should be %s is %s",
				533	i,
				534	FormatCapture(text, correct.submatch[i]).c_str(),
				535	FormatCapture(text, r.submatch[i]).c_str());
				536	} else {
				537	LOG(INFO) <<
				538	StringPrintf(" $%d: %s ok", i,
				539	FormatCapture(text, r.submatch[i]).c_str());
				540	}
				541	}
				542	}
				543
				544	if (!all_okay) {
				545	if (FLAGS_max_regexp_failures > 0 && --FLAGS_max_regexp_failures == 0)
				546	LOG(QFATAL) << "Too many regexp failures.";
				547	}
				548
				549	return all_okay;
				550	}
				551
				552	void TestInstance::LogMatch(const char* prefix, Engine e,
				553	const StringPiece& text, const StringPiece& context,
				554	Prog::Anchor anchor) {
				555	LOG(INFO) << prefix
				556	<< EngineString(e)
				557	<< " regexp "
				558	<< CEscape(regexp_str_)
				559	<< " "
				560	<< CEscape(regexp_->ToString())
				561	<< " text "
				562	<< CEscape(text)
				563	<< " ("
				564	<< text.begin() - context.begin()
				565	<< ","
				566	<< text.end() - context.begin()
				567	<< ") of context "
				568	<< CEscape(context)
				569	<< " (" << FormatKind(kind_)
				570	<< ", " << FormatAnchor(anchor)
				571	<< ", " << FormatMode(flags_)
				572	<< ")";
				573	}
				574
				575	static Prog::MatchKind kinds[] = {
				576	Prog::kFirstMatch,
				577	Prog::kLongestMatch,
				578	Prog::kFullMatch,
				579	};
				580
				581	// Test all possible match kinds and parse modes.
				582	Tester::Tester(const StringPiece& regexp) {
				583	error_ = false;
				584	for (int i = 0; i < arraysize(kinds); i++) {
				585	for (int j = 0; j < arraysize(parse_modes); j++) {
				586	TestInstance* t = new TestInstance(regexp, kinds[i],
				587	parse_modes[j].parse_flags);
				588	error_ \|= t->error();
				589	v_.push_back(t);
				590	}
				591	}
				592	}
				593
				594	Tester::~Tester() {
				595	for (int i = 0; i < v_.size(); i++)
				596	delete v_[i];
				597	}
				598
				599	bool Tester::TestCase(const StringPiece& text, const StringPiece& context,
				600	Prog::Anchor anchor) {
				601	bool okay = true;
				602	for (int i = 0; i < v_.size(); i++)
				603	okay &= (!v_[i]->error() && v_[i]->RunCase(text, context, anchor));
				604	return okay;
				605	}
				606
				607	static Prog::Anchor anchors[] = {
				608	Prog::kAnchored,
				609	Prog::kUnanchored
				610	};
				611
				612	bool Tester::TestInput(const StringPiece& text) {
				613	bool okay = TestInputInContext(text, text);
				614	if (text.size() > 0) {
				615	StringPiece sp;
				616	sp = text;
				617	sp.remove_prefix(1);
				618	okay &= TestInputInContext(sp, text);
				619	sp = text;
				620	sp.remove_suffix(1);
				621	okay &= TestInputInContext(sp, text);
				622	}
				623	return okay;
				624	}
				625
				626	bool Tester::TestInputInContext(const StringPiece& text,
				627	const StringPiece& context) {
				628	bool okay = true;
				629	for (int i = 0; i < arraysize(anchors); i++)
				630	okay &= TestCase(text, context, anchors[i]);
				631	return okay;
				632	}
				633
				634	bool TestRegexpOnText(const StringPiece& regexp,
				635	const StringPiece& text) {
				636	Tester t(regexp);
				637	return t.TestInput(text);
				638	}
				639
				640	} // namespace re2