Alexander Gutkin | 0d4c523 | 2013-02-28 13:47:27 +0000 | [diff] [blame] | 1 | // Copyright 2008 The RE2 Authors. All Rights Reserved. |
| 2 | // Use of this source code is governed by a BSD-style |
| 3 | // license that can be found in the LICENSE file. |
| 4 | |
| 5 | // Exhaustive testing of regular expression matching. |
| 6 | |
| 7 | // Each test picks an alphabet (e.g., "abc"), a maximum string length, |
| 8 | // a maximum regular expression length, and a maximum number of letters |
| 9 | // that can appear in the regular expression. Given these parameters, |
| 10 | // it tries every possible regular expression and string, verifying that |
| 11 | // the NFA, DFA, and a trivial backtracking implementation agree about |
| 12 | // the location of the match. |
| 13 | |
| 14 | #include <stdlib.h> |
| 15 | #include <stdio.h> |
| 16 | |
| 17 | #ifndef LOGGING |
| 18 | #define LOGGING 0 |
| 19 | #endif |
| 20 | |
| 21 | #include "util/test.h" |
| 22 | #include "re2/testing/exhaustive_tester.h" |
| 23 | #include "re2/testing/tester.h" |
| 24 | |
| 25 | DEFINE_bool(show_regexps, false, "show regexps during testing"); |
| 26 | |
| 27 | DEFINE_int32(max_bad_regexp_inputs, 1, |
| 28 | "Stop testing a regular expression after finding this many " |
| 29 | "strings that break it."); |
| 30 | |
| 31 | // Compiled in debug mode, the usual tests run for over an hour. |
| 32 | // Have to cut it down to make the unit test machines happy. |
| 33 | DEFINE_bool(quick_debug_mode, true, "Run fewer tests in debug mode."); |
| 34 | |
| 35 | namespace re2 { |
| 36 | |
| 37 | static char* escape(const StringPiece& sp) { |
| 38 | static char buf[512]; |
| 39 | char* p = buf; |
| 40 | *p++ = '\"'; |
| 41 | for (int i = 0; i < sp.size(); i++) { |
| 42 | if(p+5 >= buf+sizeof buf) |
| 43 | LOG(FATAL) << "ExhaustiveTester escape: too long"; |
| 44 | if(sp[i] == '\\' || sp[i] == '\"') { |
| 45 | *p++ = '\\'; |
| 46 | *p++ = sp[i]; |
| 47 | } else if(sp[i] == '\n') { |
| 48 | *p++ = '\\'; |
| 49 | *p++ = 'n'; |
| 50 | } else { |
| 51 | *p++ = sp[i]; |
| 52 | } |
| 53 | } |
| 54 | *p++ = '\"'; |
| 55 | *p = '\0'; |
| 56 | return buf; |
| 57 | } |
| 58 | |
| 59 | static void PrintResult(const RE2& re, const StringPiece& input, RE2::Anchor anchor, StringPiece *m, int n) { |
| 60 | if (!re.Match(input, 0, input.size(), anchor, m, n)) { |
| 61 | printf("-"); |
| 62 | return; |
| 63 | } |
| 64 | for (int i = 0; i < n; i++) { |
| 65 | if (i > 0) |
| 66 | printf(" "); |
| 67 | if (m[i].begin() == NULL) |
| 68 | printf("-"); |
| 69 | else |
| 70 | printf("%d-%d", static_cast<int>(m[i].begin() - input.begin()), static_cast<int>(m[i].end() - input.begin())); |
| 71 | } |
| 72 | } |
| 73 | |
| 74 | // Processes a single generated regexp. |
| 75 | // Compiles it using Regexp interface and PCRE, and then |
| 76 | // checks that NFA, DFA, and PCRE all return the same results. |
| 77 | void ExhaustiveTester::HandleRegexp(const string& const_regexp) { |
| 78 | regexps_++; |
| 79 | string regexp = const_regexp; |
| 80 | if (!topwrapper_.empty()) |
| 81 | regexp = StringPrintf(topwrapper_.c_str(), regexp.c_str()); |
| 82 | |
| 83 | if (FLAGS_show_regexps) { |
| 84 | printf("\r%s", regexp.c_str()); |
| 85 | fflush(stdout); |
| 86 | } |
| 87 | |
| 88 | if (LOGGING) { |
| 89 | // Write out test cases and answers for use in testing |
| 90 | // other implementations, such as Go's regexp package. |
| 91 | if (randomstrings_) |
| 92 | LOG(ERROR) << "Cannot log with random strings."; |
| 93 | if (regexps_ == 1) { // first |
| 94 | printf("strings\n"); |
| 95 | strgen_.Reset(); |
| 96 | while (strgen_.HasNext()) |
| 97 | printf("%s\n", escape(strgen_.Next())); |
| 98 | printf("regexps\n"); |
| 99 | } |
| 100 | printf("%s\n", escape(regexp)); |
| 101 | |
| 102 | RE2 re(regexp); |
| 103 | RE2::Options longest; |
| 104 | longest.set_longest_match(true); |
| 105 | RE2 relongest(regexp, longest); |
| 106 | int ngroup = re.NumberOfCapturingGroups()+1; |
| 107 | StringPiece* group = new StringPiece[ngroup]; |
| 108 | |
| 109 | strgen_.Reset(); |
| 110 | while (strgen_.HasNext()) { |
| 111 | StringPiece input = strgen_.Next(); |
| 112 | PrintResult(re, input, RE2::ANCHOR_BOTH, group, ngroup); |
| 113 | printf(";"); |
| 114 | PrintResult(re, input, RE2::UNANCHORED, group, ngroup); |
| 115 | printf(";"); |
| 116 | PrintResult(relongest, input, RE2::ANCHOR_BOTH, group, ngroup); |
| 117 | printf(";"); |
| 118 | PrintResult(relongest, input, RE2::UNANCHORED, group, ngroup); |
| 119 | printf("\n"); |
| 120 | } |
| 121 | delete[] group; |
| 122 | return; |
| 123 | } |
| 124 | |
| 125 | Tester tester(regexp); |
| 126 | if (tester.error()) |
| 127 | return; |
| 128 | |
| 129 | strgen_.Reset(); |
| 130 | strgen_.GenerateNULL(); |
| 131 | if (randomstrings_) |
| 132 | strgen_.Random(stringseed_, stringcount_); |
| 133 | int bad_inputs = 0; |
| 134 | while (strgen_.HasNext()) { |
| 135 | tests_++; |
| 136 | if (!tester.TestInput(strgen_.Next())) { |
| 137 | failures_++; |
| 138 | if (++bad_inputs >= FLAGS_max_bad_regexp_inputs) |
| 139 | break; |
| 140 | } |
| 141 | } |
| 142 | } |
| 143 | |
| 144 | // Runs an exhaustive test on the given parameters. |
| 145 | void ExhaustiveTest(int maxatoms, int maxops, |
| 146 | const vector<string>& alphabet, |
| 147 | const vector<string>& ops, |
| 148 | int maxstrlen, const vector<string>& stralphabet, |
| 149 | const string& wrapper, |
| 150 | const string& topwrapper) { |
| 151 | if (DEBUG_MODE && FLAGS_quick_debug_mode) { |
| 152 | if (maxatoms > 1) |
| 153 | maxatoms--; |
| 154 | if (maxops > 1) |
| 155 | maxops--; |
| 156 | if (maxstrlen > 1) |
| 157 | maxstrlen--; |
| 158 | } |
| 159 | ExhaustiveTester t(maxatoms, maxops, alphabet, ops, |
| 160 | maxstrlen, stralphabet, wrapper, |
| 161 | topwrapper); |
| 162 | t.Generate(); |
| 163 | if (!LOGGING) { |
| 164 | printf("%d regexps, %d tests, %d failures [%d/%d str]\n", |
| 165 | t.regexps(), t.tests(), t.failures(), maxstrlen, (int)stralphabet.size()); |
| 166 | } |
| 167 | EXPECT_EQ(0, t.failures()); |
| 168 | } |
| 169 | |
| 170 | // Runs an exhaustive test using the given parameters and |
| 171 | // the basic egrep operators. |
| 172 | void EgrepTest(int maxatoms, int maxops, const string& alphabet, |
| 173 | int maxstrlen, const string& stralphabet, |
| 174 | const string& wrapper) { |
| 175 | const char* tops[] = { "", "^(?:%s)", "(?:%s)$", "^(?:%s)$" }; |
| 176 | |
| 177 | for (int i = 0; i < arraysize(tops); i++) { |
| 178 | ExhaustiveTest(maxatoms, maxops, |
| 179 | Split("", alphabet), |
| 180 | RegexpGenerator::EgrepOps(), |
| 181 | maxstrlen, |
| 182 | Split("", stralphabet), |
| 183 | wrapper, |
| 184 | tops[i]); |
| 185 | } |
| 186 | } |
| 187 | |
| 188 | } // namespace re2 |