Alexander Gutkin | 0d4c523 | 2013-02-28 13:47:27 +0000 | [diff] [blame] | 1 | // Copyright 2006-2008 The RE2 Authors. All Rights Reserved. |
| 2 | // Use of this source code is governed by a BSD-style |
| 3 | // license that can be found in the LICENSE file. |
| 4 | |
| 5 | #include <vector> |
| 6 | #include "util/test.h" |
| 7 | #include "re2/prog.h" |
| 8 | #include "re2/re2.h" |
| 9 | #include "re2/regexp.h" |
| 10 | #include "re2/testing/regexp_generator.h" |
| 11 | #include "re2/testing/string_generator.h" |
| 12 | |
| 13 | namespace re2 { |
| 14 | |
| 15 | // Test that C++ strings are compared as uint8s, not int8s. |
| 16 | // PossibleMatchRange doesn't depend on this, but callers probably will. |
| 17 | TEST(CplusplusStrings, EightBit) { |
| 18 | string s = "\x70"; |
| 19 | string t = "\xA0"; |
| 20 | EXPECT_LT(s, t); |
| 21 | } |
| 22 | |
| 23 | struct PrefixTest { |
| 24 | const char* regexp; |
| 25 | int maxlen; |
| 26 | const char* min; |
| 27 | const char* max; |
| 28 | }; |
| 29 | |
| 30 | static PrefixTest tests[] = { |
| 31 | { "", 10, "", "", }, |
| 32 | { "Abcdef", 10, "Abcdef", "Abcdef" }, |
| 33 | { "abc(def|ghi)", 10, "abcdef", "abcghi" }, |
| 34 | { "a+hello", 10, "aa", "ahello" }, |
| 35 | { "a*hello", 10, "a", "hello" }, |
| 36 | { "def|abc", 10, "abc", "def" }, |
| 37 | { "a(b)(c)[d]", 10, "abcd", "abcd" }, |
| 38 | { "ab(cab|cat)", 10, "abcab", "abcat" }, |
| 39 | { "ab(cab|ca)x", 10, "abcabx", "abcax" }, |
| 40 | { "(ab|x)(c|de)", 10, "abc", "xde" }, |
| 41 | { "(ab|x)?(c|z)?", 10, "", "z" }, |
| 42 | { "[^\\s\\S]", 10, "", "" }, |
| 43 | { "(abc)+", 5, "abc", "abcac" }, |
| 44 | { "(abc)+", 2, "ab", "ac" }, |
| 45 | { "(abc)+", 1, "a", "b" }, |
| 46 | { "[a\xC3\xA1]", 4, "a", "\xC3\xA1" }, |
| 47 | { "a*", 10, "", "ab" }, |
| 48 | |
| 49 | { "(?i)Abcdef", 10, "ABCDEF", "abcdef" }, |
| 50 | { "(?i)abc(def|ghi)", 10, "ABCDEF", "abcghi" }, |
| 51 | { "(?i)a+hello", 10, "AA", "ahello" }, |
| 52 | { "(?i)a*hello", 10, "A", "hello" }, |
| 53 | { "(?i)def|abc", 10, "ABC", "def" }, |
| 54 | { "(?i)a(b)(c)[d]", 10, "ABCD", "abcd" }, |
| 55 | { "(?i)ab(cab|cat)", 10, "ABCAB", "abcat" }, |
| 56 | { "(?i)ab(cab|ca)x", 10, "ABCABX", "abcax" }, |
| 57 | { "(?i)(ab|x)(c|de)", 10, "ABC", "xde" }, |
| 58 | { "(?i)(ab|x)?(c|z)?", 10, "", "z" }, |
| 59 | { "(?i)[^\\s\\S]", 10, "", "" }, |
| 60 | { "(?i)(abc)+", 5, "ABC", "abcac" }, |
| 61 | { "(?i)(abc)+", 2, "AB", "ac" }, |
| 62 | { "(?i)(abc)+", 1, "A", "b" }, |
| 63 | { "(?i)[a\xC3\xA1]", 4, "A", "\xC3\xA1" }, |
| 64 | { "(?i)a*", 10, "", "ab" }, |
| 65 | { "(?i)A*", 10, "", "ab" }, |
| 66 | |
| 67 | { "\\AAbcdef", 10, "Abcdef", "Abcdef" }, |
| 68 | { "\\Aabc(def|ghi)", 10, "abcdef", "abcghi" }, |
| 69 | { "\\Aa+hello", 10, "aa", "ahello" }, |
| 70 | { "\\Aa*hello", 10, "a", "hello" }, |
| 71 | { "\\Adef|abc", 10, "abc", "def" }, |
| 72 | { "\\Aa(b)(c)[d]", 10, "abcd", "abcd" }, |
| 73 | { "\\Aab(cab|cat)", 10, "abcab", "abcat" }, |
| 74 | { "\\Aab(cab|ca)x", 10, "abcabx", "abcax" }, |
| 75 | { "\\A(ab|x)(c|de)", 10, "abc", "xde" }, |
| 76 | { "\\A(ab|x)?(c|z)?", 10, "", "z" }, |
| 77 | { "\\A[^\\s\\S]", 10, "", "" }, |
| 78 | { "\\A(abc)+", 5, "abc", "abcac" }, |
| 79 | { "\\A(abc)+", 2, "ab", "ac" }, |
| 80 | { "\\A(abc)+", 1, "a", "b" }, |
| 81 | { "\\A[a\xC3\xA1]", 4, "a", "\xC3\xA1" }, |
| 82 | { "\\Aa*", 10, "", "ab" }, |
| 83 | |
| 84 | { "(?i)\\AAbcdef", 10, "ABCDEF", "abcdef" }, |
| 85 | { "(?i)\\Aabc(def|ghi)", 10, "ABCDEF", "abcghi" }, |
| 86 | { "(?i)\\Aa+hello", 10, "AA", "ahello" }, |
| 87 | { "(?i)\\Aa*hello", 10, "A", "hello" }, |
| 88 | { "(?i)\\Adef|abc", 10, "ABC", "def" }, |
| 89 | { "(?i)\\Aa(b)(c)[d]", 10, "ABCD", "abcd" }, |
| 90 | { "(?i)\\Aab(cab|cat)", 10, "ABCAB", "abcat" }, |
| 91 | { "(?i)\\Aab(cab|ca)x", 10, "ABCABX", "abcax" }, |
| 92 | { "(?i)\\A(ab|x)(c|de)", 10, "ABC", "xde" }, |
| 93 | { "(?i)\\A(ab|x)?(c|z)?", 10, "", "z" }, |
| 94 | { "(?i)\\A[^\\s\\S]", 10, "", "" }, |
| 95 | { "(?i)\\A(abc)+", 5, "ABC", "abcac" }, |
| 96 | { "(?i)\\A(abc)+", 2, "AB", "ac" }, |
| 97 | { "(?i)\\A(abc)+", 1, "A", "b" }, |
| 98 | { "(?i)\\A[a\xC3\xA1]", 4, "A", "\xC3\xA1" }, |
| 99 | { "(?i)\\Aa*", 10, "", "ab" }, |
| 100 | { "(?i)\\AA*", 10, "", "ab" }, |
| 101 | }; |
| 102 | |
| 103 | TEST(PossibleMatchRange, HandWritten) { |
| 104 | for (int i = 0; i < arraysize(tests); i++) { |
| 105 | for (int j = 0; j < 2; j++) { |
| 106 | const PrefixTest& t = tests[i]; |
| 107 | string min, max; |
| 108 | if (j == 0) { |
| 109 | LOG(INFO) << "Checking regexp=" << CEscape(t.regexp); |
| 110 | Regexp* re = Regexp::Parse(t.regexp, Regexp::LikePerl, NULL); |
| 111 | CHECK(re); |
| 112 | Prog* prog = re->CompileToProg(0); |
| 113 | CHECK(prog); |
| 114 | CHECK(prog->PossibleMatchRange(&min, &max, t.maxlen)) |
| 115 | << " " << t.regexp; |
| 116 | delete prog; |
| 117 | re->Decref(); |
| 118 | } else { |
| 119 | CHECK(RE2(t.regexp).PossibleMatchRange(&min, &max, t.maxlen)); |
| 120 | } |
| 121 | EXPECT_EQ(t.min, min) << t.regexp; |
| 122 | EXPECT_EQ(t.max, max) << t.regexp; |
| 123 | } |
| 124 | } |
| 125 | } |
| 126 | |
| 127 | // Test cases where PossibleMatchRange should return false. |
| 128 | TEST(PossibleMatchRange, Failures) { |
| 129 | string min, max; |
| 130 | |
| 131 | // Fails because no room to write max. |
| 132 | EXPECT_FALSE(RE2("abc").PossibleMatchRange(&min, &max, 0)); |
| 133 | |
| 134 | // Fails because there is no max -- any non-empty string matches |
| 135 | // or begins a match. Have to use Latin-1 input, because there |
| 136 | // are no valid UTF-8 strings beginning with byte 0xFF. |
| 137 | EXPECT_FALSE(RE2("[\\s\\S]+", RE2::Latin1). |
| 138 | PossibleMatchRange(&min, &max, 10)) |
| 139 | << "min=" << CEscape(min) << ", max=" << CEscape(max); |
| 140 | EXPECT_FALSE(RE2("[\\0-\xFF]+", RE2::Latin1). |
| 141 | PossibleMatchRange(&min, &max, 10)) |
| 142 | << "min=" << CEscape(min) << ", max=" << CEscape(max); |
| 143 | EXPECT_FALSE(RE2(".+hello", RE2::Latin1). |
| 144 | PossibleMatchRange(&min, &max, 10)) |
| 145 | << "min=" << CEscape(min) << ", max=" << CEscape(max); |
| 146 | EXPECT_FALSE(RE2(".*hello", RE2::Latin1). |
| 147 | PossibleMatchRange(&min, &max, 10)) |
| 148 | << "min=" << CEscape(min) << ", max=" << CEscape(max); |
| 149 | EXPECT_FALSE(RE2(".*", RE2::Latin1). |
| 150 | PossibleMatchRange(&min, &max, 10)) |
| 151 | << "min=" << CEscape(min) << ", max=" << CEscape(max); |
| 152 | EXPECT_FALSE(RE2("\\C*"). |
| 153 | PossibleMatchRange(&min, &max, 10)) |
| 154 | << "min=" << CEscape(min) << ", max=" << CEscape(max); |
| 155 | |
| 156 | // Fails because it's a malformed regexp. |
| 157 | EXPECT_FALSE(RE2("*hello").PossibleMatchRange(&min, &max, 10)) |
| 158 | << "min=" << CEscape(min) << ", max=" << CEscape(max); |
| 159 | } |
| 160 | |
| 161 | // Exhaustive test: generate all regexps within parameters, |
| 162 | // then generate all strings of a given length over a given alphabet, |
| 163 | // then check that the prefix information agrees with whether |
| 164 | // the regexp matches each of the strings. |
| 165 | class PossibleMatchTester : public RegexpGenerator { |
| 166 | public: |
| 167 | PossibleMatchTester(int maxatoms, |
| 168 | int maxops, |
| 169 | const vector<string>& alphabet, |
| 170 | const vector<string>& ops, |
| 171 | int maxstrlen, |
| 172 | const vector<string>& stralphabet) |
| 173 | : RegexpGenerator(maxatoms, maxops, alphabet, ops), |
| 174 | strgen_(maxstrlen, stralphabet), |
| 175 | regexps_(0), tests_(0) { } |
| 176 | |
| 177 | int regexps() { return regexps_; } |
| 178 | int tests() { return tests_; } |
| 179 | |
| 180 | // Needed for RegexpGenerator interface. |
| 181 | void HandleRegexp(const string& regexp); |
| 182 | |
| 183 | private: |
| 184 | StringGenerator strgen_; |
| 185 | |
| 186 | int regexps_; // Number of HandleRegexp calls |
| 187 | int tests_; // Number of regexp tests. |
| 188 | |
| 189 | DISALLOW_EVIL_CONSTRUCTORS(PossibleMatchTester); |
| 190 | }; |
| 191 | |
| 192 | // Processes a single generated regexp. |
| 193 | // Checks that all accepted strings agree with the prefix range. |
| 194 | void PossibleMatchTester::HandleRegexp(const string& regexp) { |
| 195 | regexps_++; |
| 196 | |
| 197 | VLOG(3) << CEscape(regexp); |
| 198 | |
| 199 | RE2 re(regexp, RE2::Latin1); |
| 200 | CHECK_EQ(re.error(), ""); |
| 201 | |
| 202 | string min, max; |
| 203 | if(!re.PossibleMatchRange(&min, &max, 10)) { |
| 204 | // There's no good max for "\\C*". Can't use strcmp |
| 205 | // because sometimes it gets embedded in more |
| 206 | // complicated expressions. |
| 207 | if(strstr(regexp.c_str(), "\\C*")) |
| 208 | return; |
| 209 | LOG(QFATAL) << "PossibleMatchRange failed on: " << CEscape(regexp); |
| 210 | } |
| 211 | |
| 212 | strgen_.Reset(); |
| 213 | while (strgen_.HasNext()) { |
| 214 | const StringPiece& s = strgen_.Next(); |
| 215 | tests_++; |
| 216 | if (!RE2::FullMatch(s, re)) |
| 217 | continue; |
| 218 | CHECK_GE(s, min) << " regexp: " << regexp << " max: " << max; |
| 219 | CHECK_LE(s, max) << " regexp: " << regexp << " min: " << min; |
| 220 | } |
| 221 | } |
| 222 | |
| 223 | TEST(PossibleMatchRange, Exhaustive) { |
| 224 | int natom = 3; |
| 225 | int noperator = 3; |
| 226 | int stringlen = 5; |
| 227 | if (DEBUG_MODE) { |
| 228 | natom = 2; |
| 229 | noperator = 3; |
| 230 | stringlen = 3; |
| 231 | } |
| 232 | PossibleMatchTester t(natom, noperator, Split(" ", "a b [0-9]"), |
| 233 | RegexpGenerator::EgrepOps(), |
| 234 | stringlen, Explode("ab4")); |
| 235 | t.Generate(); |
| 236 | LOG(INFO) << t.regexps() << " regexps, " |
| 237 | << t.tests() << " tests"; |
| 238 | } |
| 239 | |
| 240 | } // namespace re2 |