Alexander Gutkin | 0d4c523 | 2013-02-28 13:47:27 +0000 | [diff] [blame] | 1 | // Copyright 2008 The RE2 Authors. All Rights Reserved. |
| 2 | // Use of this source code is governed by a BSD-style |
| 3 | // license that can be found in the LICENSE file. |
| 4 | |
| 5 | // Exhaustive testing of regular expression matching. |
| 6 | |
| 7 | #include "util/test.h" |
| 8 | #include "re2/re2.h" |
| 9 | #include "re2/testing/exhaustive_tester.h" |
| 10 | |
| 11 | DECLARE_string(regexp_engines); |
| 12 | |
| 13 | namespace re2 { |
| 14 | |
| 15 | // Test empty string matches (aka "(?:)") |
| 16 | TEST(EmptyString, Exhaustive) { |
| 17 | ExhaustiveTest(2, 2, Split(" ", "(?:) a"), |
| 18 | RegexpGenerator::EgrepOps(), |
| 19 | 5, Split("", "ab"), "", ""); |
| 20 | } |
| 21 | |
| 22 | // Test escaped versions of regexp syntax. |
| 23 | TEST(Punctuation, Literals) { |
| 24 | vector<string> alphabet = Explode("()*+?{}[]\\^$."); |
| 25 | vector<string> escaped = alphabet; |
| 26 | for (int i = 0; i < escaped.size(); i++) |
| 27 | escaped[i] = "\\" + escaped[i]; |
| 28 | ExhaustiveTest(1, 1, escaped, RegexpGenerator::EgrepOps(), |
| 29 | 2, alphabet, "", ""); |
| 30 | } |
| 31 | |
| 32 | // Test ^ $ . \A \z in presence of line endings. |
| 33 | // Have to wrap the empty-width ones in (?:) so that |
| 34 | // they can be repeated -- PCRE rejects ^* but allows (?:^)* |
| 35 | TEST(LineEnds, Exhaustive) { |
| 36 | ExhaustiveTest(2, 2, Split(" ", "(?:^) (?:$) . a \\n (?:\\A) (?:\\z)"), |
| 37 | RegexpGenerator::EgrepOps(), |
| 38 | 4, Explode("ab\n"), "", ""); |
| 39 | } |
| 40 | |
| 41 | // Test what does and does not match \n. |
| 42 | // This would be a good test, except that PCRE seems to have a bug: |
| 43 | // in single-byte character set mode (the default), |
| 44 | // [^a] matches \n, but in UTF-8 mode it does not. |
| 45 | // So when we run the test, the tester complains that |
| 46 | // we don't agree with PCRE, but it's PCRE that is at fault. |
| 47 | // For what it's worth, Perl gets this right (matches |
| 48 | // regardless of whether UTF-8 input is selected): |
| 49 | // |
| 50 | // #!/usr/bin/perl |
| 51 | // use POSIX qw(locale_h); |
| 52 | // print "matches in latin1\n" if "\n" =~ /[^a]/; |
| 53 | // setlocale("en_US.utf8"); |
| 54 | // print "matches in utf8\n" if "\n" =~ /[^a]/; |
| 55 | // |
| 56 | // The rule chosen for RE2 is that by default, like Perl, |
| 57 | // dot does not match \n but negated character classes [^a] do. |
| 58 | // (?s) will allow dot to match \n; there is no way in RE2 |
| 59 | // to stop [^a] from matching \n, though the underlying library |
| 60 | // provides a mechanism, and RE2 could add new syntax if needed. |
| 61 | // |
| 62 | // TEST(Newlines, Exhaustive) { |
| 63 | // vector<string> empty_vector; |
| 64 | // ExhaustiveTest(1, 1, Split(" ", "\\n . a [^a]"), |
| 65 | // RegexpGenerator::EgrepOps(), |
| 66 | // 4, Explode("a\n"), ""); |
| 67 | // } |
| 68 | |
| 69 | } // namespace re2 |
| 70 | |