Alexander Gutkin | 0d4c523 | 2013-02-28 13:47:27 +0000 | [diff] [blame] | 1 | // Copyright 2009 The RE2 Authors. All Rights Reserved. |
| 2 | // Use of this source code is governed by a BSD-style |
| 3 | // license that can be found in the LICENSE file. |
| 4 | |
| 5 | #include "util/test.h" |
| 6 | #include "re2/filtered_re2.h" |
| 7 | #include "re2/re2.h" |
| 8 | |
| 9 | DECLARE_int32(filtered_re2_min_atom_len); // From prefilter_tree.cc |
| 10 | |
| 11 | namespace re2 { |
| 12 | |
| 13 | struct FilterTestVars { |
| 14 | vector<string> atoms; |
| 15 | vector<int> atom_indices; |
| 16 | vector<int> matches; |
| 17 | RE2::Options opts; |
| 18 | FilteredRE2 f; |
| 19 | }; |
| 20 | |
| 21 | TEST(FilteredRE2Test, EmptyTest) { |
| 22 | FilterTestVars v; |
| 23 | v.f.AllMatches("foo", v.atom_indices, &v.matches); |
| 24 | EXPECT_EQ(0, v.matches.size()); |
| 25 | } |
| 26 | |
| 27 | TEST(FilteredRE2Test, SmallOrTest) { |
| 28 | FLAGS_filtered_re2_min_atom_len = 4; |
| 29 | |
| 30 | FilterTestVars v; |
| 31 | int id; |
| 32 | v.f.Add("(foo|bar)", v.opts, &id); |
| 33 | |
| 34 | v.f.Compile(&v.atoms); |
| 35 | EXPECT_EQ(0, v.atoms.size()); |
| 36 | |
| 37 | v.f.AllMatches("lemurs bar", v.atom_indices, &v.matches); |
| 38 | EXPECT_EQ(1, v.matches.size()); |
| 39 | EXPECT_EQ(id, v.matches[0]); |
| 40 | } |
| 41 | |
| 42 | TEST(FilteredRE2Test, SmallLatinTest) { |
| 43 | FLAGS_filtered_re2_min_atom_len = 3; |
| 44 | FilterTestVars v; |
| 45 | int id; |
| 46 | |
| 47 | v.opts.set_utf8(false); |
| 48 | v.f.Add("\xde\xadQ\xbe\xef", v.opts, &id); |
| 49 | v.f.Compile(&v.atoms); |
| 50 | EXPECT_EQ(1, v.atoms.size()); |
| 51 | EXPECT_EQ(v.atoms[0], "\xde\xadq\xbe\xef"); |
| 52 | |
| 53 | v.atom_indices.push_back(0); |
| 54 | v.f.AllMatches("foo\xde\xadQ\xbe\xeflemur", v.atom_indices, &v.matches); |
| 55 | EXPECT_EQ(1, v.matches.size()); |
| 56 | EXPECT_EQ(id, v.matches[0]); |
| 57 | } |
| 58 | |
| 59 | struct AtomTest { |
| 60 | const char* testname; |
| 61 | // If any test needs more than this many regexps or atoms, increase |
| 62 | // the size of the corresponding array. |
| 63 | const char* regexps[20]; |
| 64 | const char* atoms[20]; |
| 65 | }; |
| 66 | |
| 67 | AtomTest atom_tests[] = { |
| 68 | { |
| 69 | // This test checks to make sure empty patterns are allowed. |
| 70 | "CheckEmptyPattern", |
| 71 | {""}, |
| 72 | {} |
| 73 | }, { |
| 74 | // This test checks that all atoms of length greater than min length |
| 75 | // are found, and no atoms that are of smaller length are found. |
| 76 | "AllAtomsGtMinLengthFound", { |
| 77 | "(abc123|def456|ghi789).*mnop[x-z]+", |
| 78 | "abc..yyy..zz", |
| 79 | "mnmnpp[a-z]+PPP" |
| 80 | }, { |
| 81 | "abc123", |
| 82 | "def456", |
| 83 | "ghi789", |
| 84 | "mnop", |
| 85 | "abc", |
| 86 | "yyy", |
| 87 | "mnmnpp", |
| 88 | "ppp" |
| 89 | } |
| 90 | }, { |
| 91 | // Test to make sure that any atoms that have another atom as a |
| 92 | // substring in an OR are removed; that is, only the shortest |
| 93 | // substring is kept. |
| 94 | "SubstrAtomRemovesSuperStrInOr", { |
| 95 | "(abc123|abc|ghi789|abc1234).*[x-z]+", |
| 96 | "abcd..yyy..yyyzzz", |
| 97 | "mnmnpp[a-z]+PPP" |
| 98 | }, { |
| 99 | "abc", |
| 100 | "ghi789", |
| 101 | "abcd", |
| 102 | "yyy", |
| 103 | "yyyzzz", |
| 104 | "mnmnpp", |
| 105 | "ppp" |
| 106 | } |
| 107 | }, { |
| 108 | // Test character class expansion. |
| 109 | "CharClassExpansion", { |
| 110 | "m[a-c][d-f]n.*[x-z]+", |
| 111 | "[x-y]bcde[ab]" |
| 112 | }, { |
| 113 | "madn", "maen", "mafn", |
| 114 | "mbdn", "mben", "mbfn", |
| 115 | "mcdn", "mcen", "mcfn", |
| 116 | "xbcdea", "xbcdeb", |
| 117 | "ybcdea", "ybcdeb" |
| 118 | } |
| 119 | }, { |
| 120 | // Test upper/lower of non-ASCII. |
| 121 | "UnicodeLower", { |
| 122 | "(?i)ΔδΠϖπΣςσ", |
| 123 | "ΛΜΝΟΠ", |
| 124 | "ψρστυ", |
| 125 | }, { |
| 126 | "δδπππσσσ", |
| 127 | "λμνοπ", |
| 128 | "ψρστυ", |
| 129 | }, |
| 130 | }, |
| 131 | }; |
| 132 | |
| 133 | void AddRegexpsAndCompile(const char* regexps[], |
| 134 | int n, |
| 135 | struct FilterTestVars* v) { |
| 136 | for (int i = 0; i < n; i++) { |
| 137 | int id; |
| 138 | v->f.Add(regexps[i], v->opts, &id); |
| 139 | } |
| 140 | v->f.Compile(&v->atoms); |
| 141 | } |
| 142 | |
| 143 | bool CheckExpectedAtoms(const char* atoms[], |
| 144 | int n, |
| 145 | const char* testname, |
| 146 | struct FilterTestVars* v) { |
| 147 | vector<string> expected; |
| 148 | for (int i = 0; i < n; i++) |
| 149 | expected.push_back(atoms[i]); |
| 150 | |
| 151 | bool pass = expected.size() == v->atoms.size(); |
| 152 | |
| 153 | sort(v->atoms.begin(), v->atoms.end()); |
| 154 | sort(expected.begin(), expected.end()); |
| 155 | for (int i = 0; pass && i < n; i++) |
| 156 | pass = pass && expected[i] == v->atoms[i]; |
| 157 | |
| 158 | if (!pass) { |
| 159 | LOG(WARNING) << "Failed " << testname; |
| 160 | LOG(WARNING) << "Expected #atoms = " << expected.size(); |
| 161 | for (int i = 0; i < expected.size(); i++) |
| 162 | LOG(WARNING) << expected[i]; |
| 163 | LOG(WARNING) << "Found #atoms = " << v->atoms.size(); |
| 164 | for (int i = 0; i < v->atoms.size(); i++) |
| 165 | LOG(WARNING) << v->atoms[i]; |
| 166 | } |
| 167 | |
| 168 | return pass; |
| 169 | } |
| 170 | |
| 171 | TEST(FilteredRE2Test, AtomTests) { |
| 172 | FLAGS_filtered_re2_min_atom_len = 3; |
| 173 | |
| 174 | int nfail = 0; |
| 175 | for (int i = 0; i < arraysize(atom_tests); i++) { |
| 176 | FilterTestVars v; |
| 177 | AtomTest* t = &atom_tests[i]; |
| 178 | int natom, nregexp; |
| 179 | for (nregexp = 0; nregexp < arraysize(t->regexps); nregexp++) |
| 180 | if (t->regexps[nregexp] == NULL) |
| 181 | break; |
| 182 | for (natom = 0; natom < arraysize(t->atoms); natom++) |
| 183 | if (t->atoms[natom] == NULL) |
| 184 | break; |
| 185 | AddRegexpsAndCompile(t->regexps, nregexp, &v); |
| 186 | if (!CheckExpectedAtoms(t->atoms, natom, t->testname, &v)) |
| 187 | nfail++; |
| 188 | } |
| 189 | EXPECT_EQ(0, nfail); |
| 190 | } |
| 191 | |
| 192 | void FindAtomIndices(const vector<string> atoms, |
| 193 | const vector<string> matched_atoms, |
| 194 | vector<int>* atom_indices) { |
| 195 | atom_indices->clear(); |
| 196 | for (int i = 0; i < matched_atoms.size(); i++) { |
| 197 | int j = 0; |
| 198 | for (; j < atoms.size(); j++) { |
| 199 | if (matched_atoms[i] == atoms[j]) { |
| 200 | atom_indices->push_back(j); |
| 201 | break; |
| 202 | } |
| 203 | EXPECT_LT(j, atoms.size()); |
| 204 | } |
| 205 | } |
| 206 | } |
| 207 | |
| 208 | TEST(FilteredRE2Test, MatchEmptyPattern) { |
| 209 | FLAGS_filtered_re2_min_atom_len = 3; |
| 210 | |
| 211 | FilterTestVars v; |
| 212 | AtomTest* t = &atom_tests[0]; |
| 213 | // We are using the regexps used in one of the atom tests |
| 214 | // for this test. Adding the EXPECT here to make sure |
| 215 | // the index we use for the test is for the correct test. |
| 216 | EXPECT_EQ("CheckEmptyPattern", string(t->testname)); |
| 217 | int nregexp; |
| 218 | for (nregexp = 0; nregexp < arraysize(t->regexps); nregexp++) |
| 219 | if (t->regexps[nregexp] == NULL) |
| 220 | break; |
| 221 | AddRegexpsAndCompile(t->regexps, nregexp, &v); |
| 222 | string text = "0123"; |
| 223 | vector<int> atom_ids; |
| 224 | vector<int> matching_regexps; |
| 225 | EXPECT_EQ(0, v.f.FirstMatch(text, atom_ids)); |
| 226 | } |
| 227 | |
| 228 | TEST(FilteredRE2Test, MatchTests) { |
| 229 | FLAGS_filtered_re2_min_atom_len = 3; |
| 230 | |
| 231 | FilterTestVars v; |
| 232 | AtomTest* t = &atom_tests[2]; |
| 233 | // We are using the regexps used in one of the atom tests |
| 234 | // for this test. |
| 235 | EXPECT_EQ("SubstrAtomRemovesSuperStrInOr", string(t->testname)); |
| 236 | int nregexp; |
| 237 | for (nregexp = 0; nregexp < arraysize(t->regexps); nregexp++) |
| 238 | if (t->regexps[nregexp] == NULL) |
| 239 | break; |
| 240 | AddRegexpsAndCompile(t->regexps, nregexp, &v); |
| 241 | |
| 242 | string text = "abc121212xyz"; |
| 243 | // atoms = abc |
| 244 | vector<int> atom_ids; |
| 245 | vector<string> atoms; |
| 246 | atoms.push_back("abc"); |
| 247 | FindAtomIndices(v.atoms, atoms, &atom_ids); |
| 248 | vector<int> matching_regexps; |
| 249 | v.f.AllMatches(text, atom_ids, &matching_regexps); |
| 250 | EXPECT_EQ(1, matching_regexps.size()); |
| 251 | |
| 252 | text = "abc12312yyyzzz"; |
| 253 | atoms.clear(); |
| 254 | atoms.push_back("abc"); |
| 255 | atoms.push_back("yyy"); |
| 256 | atoms.push_back("yyyzzz"); |
| 257 | FindAtomIndices(v.atoms, atoms, &atom_ids); |
| 258 | v.f.AllMatches(text, atom_ids, &matching_regexps); |
| 259 | EXPECT_EQ(1, matching_regexps.size()); |
| 260 | |
| 261 | text = "abcd12yyy32yyyzzz"; |
| 262 | atoms.clear(); |
| 263 | atoms.push_back("abc"); |
| 264 | atoms.push_back("abcd"); |
| 265 | atoms.push_back("yyy"); |
| 266 | atoms.push_back("yyyzzz"); |
| 267 | FindAtomIndices(v.atoms, atoms, &atom_ids); |
| 268 | LOG(INFO) << "S: " << atom_ids.size(); |
| 269 | for (int i = 0; i < atom_ids.size(); i++) |
| 270 | LOG(INFO) << "i: " << i << " : " << atom_ids[i]; |
| 271 | v.f.AllMatches(text, atom_ids, &matching_regexps); |
| 272 | EXPECT_EQ(2, matching_regexps.size()); |
| 273 | } |
| 274 | |
| 275 | } // namespace re2 |