blob: 003dc5add05696f9dfcdb88936aa1230831a265e [file] [log] [blame]
Alexander Gutkin0d4c5232013-02-28 13:47:27 +00001// Copyright 2008 The RE2 Authors. All Rights Reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// Regular expression engine tester -- test all the implementations against each other.
6
7#include "util/util.h"
8#include "util/flags.h"
9#include "re2/testing/tester.h"
10#include "re2/prog.h"
11#include "re2/re2.h"
12#include "re2/regexp.h"
13
14DEFINE_bool(dump_prog, false, "dump regexp program");
15DEFINE_bool(log_okay, false, "log successful runs");
16DEFINE_bool(dump_rprog, false, "dump reversed regexp program");
17
18DEFINE_int32(max_regexp_failures, 100,
19 "maximum number of regexp test failures (-1 = unlimited)");
20
21DEFINE_string(regexp_engines, "", "pattern to select regexp engines to test");
22
23namespace re2 {
24
25enum {
26 kMaxSubmatch = 1+16, // $0...$16
27};
28
29const char* engine_types[kEngineMax] = {
30 "Backtrack",
31 "NFA",
32 "DFA",
33 "DFA1",
34 "OnePass",
35 "BitState",
36 "RE2",
37 "RE2a",
38 "RE2b",
39 "PCRE",
40};
41
42// Returns the name string for the type t.
43static string EngineString(Engine t) {
44 if (t < 0 || t >= arraysize(engine_types) || engine_types[t] == NULL) {
45 return StringPrintf("type%d", static_cast<int>(t));
46 }
47 return engine_types[t];
48}
49
50// Returns bit mask of engines to use.
51static uint32 Engines() {
52 static uint32 cached_engines;
53 static bool did_parse;
54
55 if (did_parse)
56 return cached_engines;
57
58 if (FLAGS_regexp_engines.empty()) {
59 cached_engines = ~0;
60 } else {
61 for (Engine i = static_cast<Engine>(0); i < kEngineMax; i++)
62 if (strstr(EngineString(i).c_str(), FLAGS_regexp_engines.c_str()))
63 cached_engines |= 1<<i;
64 }
65
66 if (cached_engines == 0)
67 LOG(INFO) << "Warning: no engines enabled.";
68 if (!UsingPCRE)
69 cached_engines &= ~(1<<kEnginePCRE);
70 for (Engine i = static_cast<Engine>(0); i < kEngineMax; i++) {
71 if (cached_engines & (1<<i))
72 LOG(INFO) << EngineString(i) << " enabled";
73 }
74 did_parse = true;
75 return cached_engines;
76}
77
78// The result of running a match.
79struct TestInstance::Result {
80 bool skipped; // test skipped: wasn't applicable
81 bool matched; // found a match
82 bool untrusted; // don't really trust the answer
83 bool have_submatch; // computed all submatch info
84 bool have_submatch0; // computed just submatch[0]
85 StringPiece submatch[kMaxSubmatch];
86};
87
88typedef TestInstance::Result Result;
89
90// Formats a single capture range s in text in the form (a,b)
91// where a and b are the starting and ending offsets of s in text.
92static string FormatCapture(const StringPiece& text, const StringPiece& s) {
93 if (s.begin() == NULL)
94 return "(?,?)";
95 return StringPrintf("(%d,%d)",
96 static_cast<int>(s.begin() - text.begin()),
97 static_cast<int>(s.end() - text.begin()));
98}
99
100// Returns whether text contains non-ASCII (>= 0x80) bytes.
101static bool NonASCII(const StringPiece& text) {
102 for (int i = 0; i < text.size(); i++)
103 if ((uint8)text[i] >= 0x80)
104 return true;
105 return false;
106}
107
108// Returns string representation of match kind.
109static string FormatKind(Prog::MatchKind kind) {
110 switch (kind) {
111 case Prog::kFullMatch:
112 return "full match";
113 case Prog::kLongestMatch:
114 return "longest match";
115 case Prog::kFirstMatch:
116 return "first match";
117 case Prog::kManyMatch:
118 return "many match";
119 }
120 return "???";
121}
122
123// Returns string representation of anchor kind.
124static string FormatAnchor(Prog::Anchor anchor) {
125 switch (anchor) {
126 case Prog::kAnchored:
127 return "anchored";
128 case Prog::kUnanchored:
129 return "unanchored";
130 }
131 return "???";
132}
133
134struct ParseMode {
135 Regexp::ParseFlags parse_flags;
136 string desc;
137};
138
139static const Regexp::ParseFlags single_line =
140 Regexp::LikePerl;
141static const Regexp::ParseFlags multi_line =
142 static_cast<Regexp::ParseFlags>(Regexp::LikePerl & ~Regexp::OneLine);
143
144static ParseMode parse_modes[] = {
145 { single_line, "single-line" },
146 { single_line|Regexp::Latin1, "single-line, latin1" },
147 { multi_line, "multiline" },
148 { multi_line|Regexp::NonGreedy, "multiline, nongreedy" },
149 { multi_line|Regexp::Latin1, "multiline, latin1" },
150};
151
152static string FormatMode(Regexp::ParseFlags flags) {
153 for (int i = 0; i < arraysize(parse_modes); i++)
154 if (parse_modes[i].parse_flags == flags)
155 return parse_modes[i].desc;
156 return StringPrintf("%#x", static_cast<uint>(flags));
157}
158
159// Constructs and saves all the matching engines that
160// will be required for the given tests.
161TestInstance::TestInstance(const StringPiece& regexp_str, Prog::MatchKind kind,
162 Regexp::ParseFlags flags)
163 : regexp_str_(regexp_str),
164 kind_(kind),
165 flags_(flags),
166 error_(false),
167 regexp_(NULL),
168 num_captures_(0),
169 prog_(NULL),
170 rprog_(NULL),
171 re_(NULL),
172 re2_(NULL) {
173
174 VLOG(1) << CEscape(regexp_str);
175
176 // Compile regexp to prog.
177 // Always required - needed for backtracking (reference implementation).
178 RegexpStatus status;
179 regexp_ = Regexp::Parse(regexp_str, flags, &status);
180 if (regexp_ == NULL) {
181 LOG(INFO) << "Cannot parse: " << CEscape(regexp_str_)
182 << " mode: " << FormatMode(flags);
183 error_ = true;
184 return;
185 }
186 num_captures_ = regexp_->NumCaptures();
187 prog_ = regexp_->CompileToProg(0);
188 if (prog_ == NULL) {
189 LOG(INFO) << "Cannot compile: " << CEscape(regexp_str_);
190 error_ = true;
191 return;
192 }
193 if (FLAGS_dump_prog) {
194 LOG(INFO) << "Prog for "
195 << " regexp "
196 << CEscape(regexp_str_)
197 << " (" << FormatKind(kind_)
198 << ", " << FormatMode(flags_)
199 << ")\n"
200 << prog_->Dump();
201 }
202
203 // Compile regexp to reversed prog. Only needed for DFA engines.
204 if (Engines() & ((1<<kEngineDFA)|(1<<kEngineDFA1))) {
205 rprog_ = regexp_->CompileToReverseProg(0);
206 if (rprog_ == NULL) {
207 LOG(INFO) << "Cannot reverse compile: " << CEscape(regexp_str_);
208 error_ = true;
209 return;
210 }
211 if (FLAGS_dump_rprog)
212 LOG(INFO) << rprog_->Dump();
213 }
214
215 // Create re string that will be used for RE and RE2.
216 string re = regexp_str.as_string();
217 // Accomodate flags.
218 // Regexp::Latin1 will be accomodated below.
219 if (!(flags & Regexp::OneLine))
220 re = "(?m)" + re;
221 if (flags & Regexp::NonGreedy)
222 re = "(?U)" + re;
223 if (flags & Regexp::DotNL)
224 re = "(?s)" + re;
225
226 // Compile regexp to RE2.
227 if (Engines() & ((1<<kEngineRE2)|(1<<kEngineRE2a)|(1<<kEngineRE2b))) {
228 RE2::Options options;
229 if (flags & Regexp::Latin1)
230 options.set_encoding(RE2::Options::EncodingLatin1);
231 if (kind_ == Prog::kLongestMatch)
232 options.set_longest_match(true);
233 re2_ = new RE2(re, options);
234 if (!re2_->error().empty()) {
235 LOG(INFO) << "Cannot RE2: " << CEscape(re);
236 error_ = true;
237 return;
238 }
239 }
240
241 // Compile regexp to RE.
242 // PCRE as exposed by the RE interface isn't always usable.
243 // 1. It disagrees about handling of empty-string reptitions
244 // like matching (a*)* against "b". PCRE treats the (a*) as
245 // occurring once, while we treat it as occurring not at all.
246 // 2. It treats $ as this weird thing meaning end of string
247 // or before the \n at the end of the string.
248 // 3. It doesn't implement POSIX leftmost-longest matching.
249 // MimicsPCRE() detects 1 and 2.
250 if ((Engines() & (1<<kEnginePCRE)) && regexp_->MimicsPCRE() &&
251 kind_ != Prog::kLongestMatch) {
252 PCRE_Options o;
253 o.set_option(PCRE::UTF8);
254 if (flags & Regexp::Latin1)
255 o.set_option(PCRE::None);
256 // PCRE has interface bug keeping us from finding $0, so
257 // add one more layer of parens.
258 re_ = new PCRE("("+re+")", o);
259 if (!re_->error().empty()) {
260 LOG(INFO) << "Cannot PCRE: " << CEscape(re);
261 error_ = true;
262 return;
263 }
264 }
265}
266
267TestInstance::~TestInstance() {
268 if (regexp_)
269 regexp_->Decref();
270 delete prog_;
271 delete rprog_;
272 delete re_;
273 delete re2_;
274}
275
276// Runs a single search using the named engine type.
277// This interface hides all the irregularities of the various
278// engine interfaces from the rest of this file.
279void TestInstance::RunSearch(Engine type,
280 const StringPiece& orig_text,
281 const StringPiece& orig_context,
282 Prog::Anchor anchor,
283 Result *result) {
284 memset(result, 0, sizeof *result);
285 if (regexp_ == NULL) {
286 result->skipped = true;
287 return;
288 }
289 int nsubmatch = 1 + num_captures_; // NumCaptures doesn't count $0
290 if (nsubmatch > kMaxSubmatch)
291 nsubmatch = kMaxSubmatch;
292
293 StringPiece text = orig_text;
294 StringPiece context = orig_context;
295
296 switch (type) {
297 default:
298 LOG(FATAL) << "Bad RunSearch type: " << (int)type;
299
300 case kEngineBacktrack:
301 if (prog_ == NULL) {
302 result->skipped = true;
303 break;
304 }
305 result->matched =
306 prog_->UnsafeSearchBacktrack(text, context, anchor, kind_,
307 result->submatch, nsubmatch);
308 result->have_submatch = true;
309 break;
310
311 case kEngineNFA:
312 if (prog_ == NULL) {
313 result->skipped = true;
314 break;
315 }
316 result->matched =
317 prog_->SearchNFA(text, context, anchor, kind_,
318 result->submatch, nsubmatch);
319 result->have_submatch = true;
320 break;
321
322 case kEngineDFA:
323 if (prog_ == NULL) {
324 result->skipped = true;
325 break;
326 }
327 result->matched = prog_->SearchDFA(text, context, anchor, kind_, NULL,
328 &result->skipped, NULL);
329 break;
330
331 case kEngineDFA1:
332 if (prog_ == NULL || rprog_ == NULL) {
333 result->skipped = true;
334 break;
335 }
336 result->matched =
337 prog_->SearchDFA(text, context, anchor, kind_, result->submatch,
338 &result->skipped, NULL);
339 // If anchored, no need for second run,
340 // but do it anyway to find more bugs.
341 if (result->matched) {
342 if (!rprog_->SearchDFA(result->submatch[0], context,
343 Prog::kAnchored, Prog::kLongestMatch,
344 result->submatch,
345 &result->skipped, NULL)) {
346 LOG(ERROR) << "Reverse DFA inconsistency: " << CEscape(regexp_str_)
347 << " on " << CEscape(text);
348 result->matched = false;
349 }
350 }
351 result->have_submatch0 = true;
352 break;
353
354 case kEngineOnePass:
355 if (prog_ == NULL ||
356 anchor == Prog::kUnanchored ||
357 !prog_->IsOnePass() ||
358 nsubmatch > Prog::kMaxOnePassCapture) {
359 result->skipped = true;
360 break;
361 }
362 result->matched = prog_->SearchOnePass(text, context, anchor, kind_,
363 result->submatch, nsubmatch);
364 result->have_submatch = true;
365 break;
366
367 case kEngineBitState:
368 if (prog_ == NULL) {
369 result->skipped = true;
370 break;
371 }
372 result->matched = prog_->SearchBitState(text, context, anchor, kind_,
373 result->submatch, nsubmatch);
374 result->have_submatch = true;
375 break;
376
377 case kEngineRE2:
378 case kEngineRE2a:
379 case kEngineRE2b: {
380 if (!re2_ || text.end() != context.end()) {
381 result->skipped = true;
382 break;
383 }
384
385 RE2::Anchor re_anchor;
386 if (anchor == Prog::kAnchored)
387 re_anchor = RE2::ANCHOR_START;
388 else
389 re_anchor = RE2::UNANCHORED;
390 if (kind_ == Prog::kFullMatch)
391 re_anchor = RE2::ANCHOR_BOTH;
392
393 result->matched = re2_->Match(context,
394 text.begin() - context.begin(),
395 text.end() - context.begin(),
396 re_anchor, result->submatch, nsubmatch);
397 result->have_submatch = nsubmatch > 0;
398 break;
399 }
400
401 case kEnginePCRE: {
402 if (!re_ || text.begin() != context.begin() ||
403 text.end() != context.end()) {
404 result->skipped = true;
405 break;
406 }
407
408 const PCRE::Arg **argptr = new const PCRE::Arg*[nsubmatch];
409 PCRE::Arg *a = new PCRE::Arg[nsubmatch];
410 for (int i = 0; i < nsubmatch; i++) {
411 a[i] = PCRE::Arg(&result->submatch[i]);
412 argptr[i] = &a[i];
413 }
414 int consumed;
415 PCRE::Anchor pcre_anchor;
416 if (anchor == Prog::kAnchored)
417 pcre_anchor = PCRE::ANCHOR_START;
418 else
419 pcre_anchor = PCRE::UNANCHORED;
420 if (kind_ == Prog::kFullMatch)
421 pcre_anchor = PCRE::ANCHOR_BOTH;
422 re_->ClearHitLimit();
423 result->matched =
424 re_->DoMatch(text,
425 pcre_anchor,
426 &consumed,
427 argptr, nsubmatch);
428 if (re_->HitLimit()) {
429 result->untrusted = true;
430 delete[] argptr;
431 delete[] a;
432 break;
433 }
434 result->have_submatch = true;
435
436 // Work around RE interface bug: PCRE returns -1 as the
437 // offsets for an unmatched subexpression, and RE should
438 // turn that into StringPiece(NULL) but in fact it uses
439 // StringPiece(text.begin() - 1, 0). Oops.
440 for (int i = 0; i < nsubmatch; i++)
441 if (result->submatch[i].begin() == text.begin() - 1)
442 result->submatch[i] = NULL;
443 delete[] argptr;
444 delete[] a;
445 break;
446 }
447 }
448
449 if (!result->matched)
450 memset(result->submatch, 0, sizeof result->submatch);
451}
452
453// Checks whether r is okay given that correct is the right answer.
454// Specifically, r's answers have to match (but it doesn't have to
455// claim to have all the answers).
456static bool ResultOkay(const Result& r, const Result& correct) {
457 if (r.skipped)
458 return true;
459 if (r.matched != correct.matched)
460 return false;
461 if (r.have_submatch || r.have_submatch0) {
462 for (int i = 0; i < kMaxSubmatch; i++) {
463 if (correct.submatch[i].begin() != r.submatch[i].begin() ||
464 correct.submatch[i].size() != r.submatch[i].size())
465 return false;
466 if (!r.have_submatch)
467 break;
468 }
469 }
470 return true;
471}
472
473// Runs a single test.
474bool TestInstance::RunCase(const StringPiece& text, const StringPiece& context,
475 Prog::Anchor anchor) {
476 // Backtracking is the gold standard.
477 Result correct;
478 RunSearch(kEngineBacktrack, text, context, anchor, &correct);
479 if (correct.skipped) {
480 if (regexp_ == NULL)
481 return true;
482 LOG(ERROR) << "Skipped backtracking! " << CEscape(regexp_str_)
483 << " " << FormatMode(flags_);
484 return false;
485 }
486 VLOG(1) << "Try: regexp " << CEscape(regexp_str_)
487 << " text " << CEscape(text)
488 << " (" << FormatKind(kind_)
489 << ", " << FormatAnchor(anchor)
490 << ", " << FormatMode(flags_)
491 << ")";
492
493 // Compare the others.
494 bool all_okay = true;
495 for (Engine i = kEngineBacktrack+1; i < kEngineMax; i++) {
496 if (!(Engines() & (1<<i)))
497 continue;
498
499 Result r;
500 RunSearch(i, text, context, anchor, &r);
501 if (ResultOkay(r, correct)) {
502 if (FLAGS_log_okay)
503 LogMatch(r.skipped ? "Skipped: " : "Okay: ", i, text, context, anchor);
504 continue;
505 }
506
507 // We disagree with PCRE on the meaning of some Unicode matches.
508 // In particular, we treat all non-ASCII UTF-8 as word characters.
509 // We also treat "empty" character sets like [^\w\W] as being
510 // impossible to match, while PCRE apparently excludes some code
511 // points (e.g., 0x0080) from both \w and \W.
512 if (i == kEnginePCRE && NonASCII(text))
513 continue;
514
515 if (!r.untrusted)
516 all_okay = false;
517
518 LogMatch(r.untrusted ? "(Untrusted) Mismatch: " : "Mismatch: ", i, text,
519 context, anchor);
520 if (r.matched != correct.matched) {
521 if (r.matched) {
522 LOG(INFO) << " Should not match (but does).";
523 } else {
524 LOG(INFO) << " Should match (but does not).";
525 continue;
526 }
527 }
528 for (int i = 0; i < 1+num_captures_; i++) {
529 if (r.submatch[i].begin() != correct.submatch[i].begin() ||
530 r.submatch[i].end() != correct.submatch[i].end()) {
531 LOG(INFO) <<
532 StringPrintf(" $%d: should be %s is %s",
533 i,
534 FormatCapture(text, correct.submatch[i]).c_str(),
535 FormatCapture(text, r.submatch[i]).c_str());
536 } else {
537 LOG(INFO) <<
538 StringPrintf(" $%d: %s ok", i,
539 FormatCapture(text, r.submatch[i]).c_str());
540 }
541 }
542 }
543
544 if (!all_okay) {
545 if (FLAGS_max_regexp_failures > 0 && --FLAGS_max_regexp_failures == 0)
546 LOG(QFATAL) << "Too many regexp failures.";
547 }
548
549 return all_okay;
550}
551
552void TestInstance::LogMatch(const char* prefix, Engine e,
553 const StringPiece& text, const StringPiece& context,
554 Prog::Anchor anchor) {
555 LOG(INFO) << prefix
556 << EngineString(e)
557 << " regexp "
558 << CEscape(regexp_str_)
559 << " "
560 << CEscape(regexp_->ToString())
561 << " text "
562 << CEscape(text)
563 << " ("
564 << text.begin() - context.begin()
565 << ","
566 << text.end() - context.begin()
567 << ") of context "
568 << CEscape(context)
569 << " (" << FormatKind(kind_)
570 << ", " << FormatAnchor(anchor)
571 << ", " << FormatMode(flags_)
572 << ")";
573}
574
575static Prog::MatchKind kinds[] = {
576 Prog::kFirstMatch,
577 Prog::kLongestMatch,
578 Prog::kFullMatch,
579};
580
581// Test all possible match kinds and parse modes.
582Tester::Tester(const StringPiece& regexp) {
583 error_ = false;
584 for (int i = 0; i < arraysize(kinds); i++) {
585 for (int j = 0; j < arraysize(parse_modes); j++) {
586 TestInstance* t = new TestInstance(regexp, kinds[i],
587 parse_modes[j].parse_flags);
588 error_ |= t->error();
589 v_.push_back(t);
590 }
591 }
592}
593
594Tester::~Tester() {
595 for (int i = 0; i < v_.size(); i++)
596 delete v_[i];
597}
598
599bool Tester::TestCase(const StringPiece& text, const StringPiece& context,
600 Prog::Anchor anchor) {
601 bool okay = true;
602 for (int i = 0; i < v_.size(); i++)
603 okay &= (!v_[i]->error() && v_[i]->RunCase(text, context, anchor));
604 return okay;
605}
606
607static Prog::Anchor anchors[] = {
608 Prog::kAnchored,
609 Prog::kUnanchored
610};
611
612bool Tester::TestInput(const StringPiece& text) {
613 bool okay = TestInputInContext(text, text);
614 if (text.size() > 0) {
615 StringPiece sp;
616 sp = text;
617 sp.remove_prefix(1);
618 okay &= TestInputInContext(sp, text);
619 sp = text;
620 sp.remove_suffix(1);
621 okay &= TestInputInContext(sp, text);
622 }
623 return okay;
624}
625
626bool Tester::TestInputInContext(const StringPiece& text,
627 const StringPiece& context) {
628 bool okay = true;
629 for (int i = 0; i < arraysize(anchors); i++)
630 okay &= TestCase(text, context, anchors[i]);
631 return okay;
632}
633
634bool TestRegexpOnText(const StringPiece& regexp,
635 const StringPiece& text) {
636 Tester t(regexp);
637 return t.TestInput(text);
638}
639
640} // namespace re2