Alexander Gutkin | 0d4c523 | 2013-02-28 13:47:27 +0000 | [diff] [blame] | 1 | // Copyright 2008 The RE2 Authors. All Rights Reserved. |
| 2 | // Use of this source code is governed by a BSD-style |
| 3 | // license that can be found in the LICENSE file. |
| 4 | |
| 5 | // Exhaustive testing of regular expression matching. |
| 6 | |
| 7 | #include "util/test.h" |
| 8 | #include "re2/testing/exhaustive_tester.h" |
| 9 | |
| 10 | namespace re2 { |
| 11 | |
| 12 | // Test simple character classes by themselves. |
| 13 | TEST(CharacterClasses, Exhaustive) { |
| 14 | vector<string> atoms = Split(" ", |
| 15 | "[a] [b] [ab] [^bc] [b-d] [^b-d] []a] [-a] [a-] [^-a] [a-b-c] a b ."); |
| 16 | ExhaustiveTest(2, 1, atoms, RegexpGenerator::EgrepOps(), |
| 17 | 5, Explode("ab"), "", ""); |
| 18 | } |
| 19 | |
| 20 | // Test simple character classes inside a___b (for example, a[a]b). |
| 21 | TEST(CharacterClasses, ExhaustiveAB) { |
| 22 | vector<string> atoms = Split(" ", |
| 23 | "[a] [b] [ab] [^bc] [b-d] [^b-d] []a] [-a] [a-] [^-a] [a-b-c] a b ."); |
| 24 | ExhaustiveTest(2, 1, atoms, RegexpGenerator::EgrepOps(), |
| 25 | 5, Explode("ab"), "a%sb", ""); |
| 26 | } |
| 27 | |
| 28 | // Returns UTF8 for Rune r |
| 29 | static string UTF8(Rune r) { |
| 30 | char buf[UTFmax+1]; |
| 31 | buf[runetochar(buf, &r)] = 0; |
| 32 | return string(buf); |
| 33 | } |
| 34 | |
| 35 | // Returns a vector of "interesting" UTF8 characters. |
| 36 | // Unicode is now too big to just return all of them, |
| 37 | // so UTF8Characters return a set likely to be good test cases. |
| 38 | static const vector<string>& InterestingUTF8() { |
| 39 | static bool init; |
| 40 | static vector<string> v; |
| 41 | |
| 42 | if (init) |
| 43 | return v; |
| 44 | |
| 45 | init = true; |
| 46 | // All the Latin1 equivalents are interesting. |
| 47 | for (int i = 1; i < 256; i++) |
| 48 | v.push_back(UTF8(i)); |
| 49 | |
| 50 | // After that, the codes near bit boundaries are |
| 51 | // interesting, because they span byte sequence lengths. |
| 52 | for (int j = 0; j < 8; j++) |
| 53 | v.push_back(UTF8(256 + j)); |
| 54 | for (int i = 512; i < Runemax; i <<= 1) |
| 55 | for (int j = -8; j < 8; j++) |
| 56 | v.push_back(UTF8(i + j)); |
| 57 | |
| 58 | // The codes near Runemax, including Runemax itself, are interesting. |
| 59 | for (int j = -8; j <= 0; j++) |
| 60 | v.push_back(UTF8(Runemax + j)); |
| 61 | |
| 62 | return v; |
| 63 | } |
| 64 | |
| 65 | // Test interesting UTF-8 characters against character classes. |
| 66 | TEST(InterestingUTF8, SingleOps) { |
| 67 | vector<string> atoms = Split(" ", |
| 68 | ". ^ $ \\a \\f \\n \\r \\t \\v \\d \\D \\s \\S \\w \\W \\b \\B " |
| 69 | "[[:alnum:]] [[:alpha:]] [[:blank:]] [[:cntrl:]] [[:digit:]] " |
| 70 | "[[:graph:]] [[:lower:]] [[:print:]] [[:punct:]] [[:space:]] " |
| 71 | "[[:upper:]] [[:xdigit:]] [\\s\\S] [\\d\\D] [^\\w\\W] [^\\d\\D]"); |
| 72 | vector<string> ops; // no ops |
| 73 | ExhaustiveTest(1, 0, atoms, ops, |
| 74 | 1, InterestingUTF8(), "", ""); |
| 75 | } |
| 76 | |
| 77 | // Test interesting UTF-8 characters against character classes, |
| 78 | // but wrap everything inside AB. |
| 79 | TEST(InterestingUTF8, AB) { |
| 80 | vector<string> atoms = Split(" ", |
| 81 | ". ^ $ \\a \\f \\n \\r \\t \\v \\d \\D \\s \\S \\w \\W \\b \\B " |
| 82 | "[[:alnum:]] [[:alpha:]] [[:blank:]] [[:cntrl:]] [[:digit:]] " |
| 83 | "[[:graph:]] [[:lower:]] [[:print:]] [[:punct:]] [[:space:]] " |
| 84 | "[[:upper:]] [[:xdigit:]] [\\s\\S] [\\d\\D] [^\\w\\W] [^\\d\\D]"); |
| 85 | vector<string> ops; // no ops |
| 86 | vector<string> alpha = InterestingUTF8(); |
| 87 | for (int i = 0; i < alpha.size(); i++) |
| 88 | alpha[i] = "a" + alpha[i] + "b"; |
| 89 | ExhaustiveTest(1, 0, atoms, ops, |
| 90 | 1, alpha, "a%sb", ""); |
| 91 | } |
| 92 | |
| 93 | } // namespace re2 |
| 94 | |