// These are tests specifically crafted for regexes that can match arbitrary
// bytes.
| 3 | |
// Thin newtype around a borrowed byte slice. It lets the tests below spell
// their haystacks as raw byte literals, including bytes that are not valid
// UTF-8.
struct R<'a>(&'a [u8]);

impl<'a> R<'a> {
    // Hands back the wrapped slice. The result carries the slice's own
    // lifetime `'a`, so it may outlive the borrow of the wrapper itself.
    fn as_bytes(&self) -> &'a [u8] {
        // The single field is a shared reference (`Copy`), so it can be
        // copied out of `*self` by pattern.
        let R(bytes) = *self;
        bytes
    }
}
| 11 | |
| 12 | mat!(word_boundary, r"(?-u) \b", " δ", None); |
| 13 | #[cfg(feature = "unicode-perl")] |
| 14 | mat!(word_boundary_unicode, r" \b", " δ", Some((0, 1))); |
| 15 | mat!(word_not_boundary, r"(?-u) \B", " δ", Some((0, 1))); |
| 16 | #[cfg(feature = "unicode-perl")] |
| 17 | mat!(word_not_boundary_unicode, r" \B", " δ", None); |
| 18 | |
| 19 | mat!(perl_w_ascii, r"(?-u)\w+", "aδ", Some((0, 1))); |
| 20 | #[cfg(feature = "unicode-perl")] |
| 21 | mat!(perl_w_unicode, r"\w+", "aδ", Some((0, 3))); |
| 22 | mat!(perl_d_ascii, r"(?-u)\d+", "1२३9", Some((0, 1))); |
| 23 | #[cfg(feature = "unicode-perl")] |
| 24 | mat!(perl_d_unicode, r"\d+", "1२३9", Some((0, 8))); |
| 25 | mat!(perl_s_ascii, r"(?-u)\s+", " \u{1680}", Some((0, 1))); |
| 26 | #[cfg(feature = "unicode-perl")] |
| 27 | mat!(perl_s_unicode, r"\s+", " \u{1680}", Some((0, 4))); |
| 28 | |
| 29 | // The first `(.+)` matches two Unicode codepoints, but can't match the 5th |
| 30 | // byte, which isn't valid UTF-8. The second (byte based) `(.+)` takes over and |
| 31 | // matches. |
| 32 | mat!( |
| 33 | mixed1, |
| 34 | r"(.+)(?-u)(.+)", |
| 35 | R(b"\xCE\x93\xCE\x94\xFF"), |
| 36 | Some((0, 5)), |
| 37 | Some((0, 4)), |
| 38 | Some((4, 5)) |
| 39 | ); |
| 40 | |
| 41 | mat!(case_ascii_one, r"(?i-u)a", "A", Some((0, 1))); |
| 42 | mat!(case_ascii_class, r"(?i-u)[a-z]+", "AaAaA", Some((0, 5))); |
| 43 | #[cfg(feature = "unicode-case")] |
| 44 | mat!(case_unicode, r"(?i)[a-z]+", "aA\u{212A}aA", Some((0, 7))); |
| 45 | mat!(case_not_unicode, r"(?i-u)[a-z]+", "aA\u{212A}aA", Some((0, 2))); |
| 46 | |
| 47 | mat!(negate_unicode, r"[^a]", "δ", Some((0, 2))); |
| 48 | mat!(negate_not_unicode, r"(?-u)[^a]", "δ", Some((0, 1))); |
| 49 | |
| 50 | // This doesn't match in a normal Unicode regex because the implicit preceding |
| 51 | // `.*?` is Unicode aware. |
| 52 | mat!(dotstar_prefix_not_unicode1, r"(?-u)a", R(b"\xFFa"), Some((1, 2))); |
| 53 | mat!(dotstar_prefix_not_unicode2, r"a", R(b"\xFFa"), Some((1, 2))); |
| 54 | |
| 55 | // Have fun with null bytes. |
| 56 | mat!( |
| 57 | null_bytes, |
| 58 | r"(?-u)(?P<cstr>[^\x00]+)\x00", |
| 59 | R(b"foo\x00"), |
| 60 | Some((0, 4)), |
| 61 | Some((0, 3)) |
| 62 | ); |
| 63 | |
| 64 | // Test that lookahead operators work properly in the face of invalid UTF-8. |
| 65 | // See: https://github.com/rust-lang/regex/issues/277 |
| 66 | matiter!( |
| 67 | invalidutf8_anchor1, |
| 68 | r"(?-u)\xcc?^", |
| 69 | R(b"\x8d#;\x1a\xa4s3\x05foobarX\\\x0f0t\xe4\x9b\xa4"), |
| 70 | (0, 0) |
| 71 | ); |
| 72 | matiter!( |
| 73 | invalidutf8_anchor2, |
| 74 | r"(?-u)^\xf7|4\xff\d\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a##########[] d\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a##########\[] #####\x80\S7|$", |
| 75 | R(b"\x8d#;\x1a\xa4s3\x05foobarX\\\x0f0t\xe4\x9b\xa4"), |
| 76 | (22, 22) |
| 77 | ); |
| 78 | matiter!( |
| 79 | invalidutf8_anchor3, |
| 80 | r"(?-u)^|ddp\xff\xffdddddlQd@\x80", |
| 81 | R(b"\x8d#;\x1a\xa4s3\x05foobarX\\\x0f0t\xe4\x9b\xa4"), |
| 82 | (0, 0) |
| 83 | ); |
| 84 | |
// Regression test: a negated byte class spanning every possible byte value
// (`\x00` through `\xff`) must be rejected when the regex is compiled.
// See https://github.com/rust-lang/regex/issues/303
#[test]
fn negated_full_byte_range() {
    let compiled = ::regex::bytes::Regex::new(r#"(?-u)[^\x00-\xff]"#);
    assert!(compiled.is_err());
}
| 90 | |
| 91 | matiter!(word_boundary_ascii1, r"(?-u:\B)x(?-u:\B)", "áxβ"); |
| 92 | matiter!( |
| 93 | word_boundary_ascii2, |
| 94 | r"(?-u:\B)", |
| 95 | "0\u{7EF5E}", |
| 96 | (2, 2), |
| 97 | (3, 3), |
| 98 | (4, 4), |
| 99 | (5, 5) |
| 100 | ); |
| 101 | |
| 102 | // See: https://github.com/rust-lang/regex/issues/264 |
| 103 | mat!(ascii_boundary_no_capture, r"(?-u)\B", "\u{28f3e}", Some((0, 0))); |
| 104 | mat!(ascii_boundary_capture, r"(?-u)(\B)", "\u{28f3e}", Some((0, 0))); |
| 105 | |
| 106 | // See: https://github.com/rust-lang/regex/issues/271 |
| 107 | mat!(end_not_wb, r"$(?-u:\B)", "\u{5c124}\u{b576c}", Some((8, 8))); |