blob: d05f138edfa5e3ada262e070af3879bbc95118be [file] [log] [blame]
// These are tests specifically crafted for regexes that can match arbitrary
// bytes.
3
/// A silly wrapper to make it possible to write and match raw bytes.
///
/// Several haystacks in this file contain bytes that are not valid UTF-8 and
/// therefore cannot be written as `&str` literals. Wrapping a byte slice in
/// `R` gives it an `as_bytes` accessor with the same name as `str::as_bytes`
/// (presumably so the test macros can treat both kinds of haystack alike;
/// the macros are defined outside this file).
struct R<'a>(&'a [u8]);

impl<'a> R<'a> {
    /// Returns the wrapped byte slice.
    ///
    /// The result carries the original `'a` lifetime rather than the lifetime
    /// of the `&self` borrow, so it may outlive the wrapper itself.
    fn as_bytes(&self) -> &'a [u8] {
        self.0
    }
}
11
12mat!(word_boundary, r"(?-u) \b", " δ", None);
13#[cfg(feature = "unicode-perl")]
14mat!(word_boundary_unicode, r" \b", " δ", Some((0, 1)));
15mat!(word_not_boundary, r"(?-u) \B", " δ", Some((0, 1)));
16#[cfg(feature = "unicode-perl")]
17mat!(word_not_boundary_unicode, r" \B", " δ", None);
18
19mat!(perl_w_ascii, r"(?-u)\w+", "aδ", Some((0, 1)));
20#[cfg(feature = "unicode-perl")]
21mat!(perl_w_unicode, r"\w+", "aδ", Some((0, 3)));
22mat!(perl_d_ascii, r"(?-u)\d+", "1२३9", Some((0, 1)));
23#[cfg(feature = "unicode-perl")]
24mat!(perl_d_unicode, r"\d+", "1२३9", Some((0, 8)));
25mat!(perl_s_ascii, r"(?-u)\s+", " \u{1680}", Some((0, 1)));
26#[cfg(feature = "unicode-perl")]
27mat!(perl_s_unicode, r"\s+", " \u{1680}", Some((0, 4)));
28
29// The first `(.+)` matches two Unicode codepoints, but can't match the 5th
30// byte, which isn't valid UTF-8. The second (byte based) `(.+)` takes over and
31// matches.
32mat!(
33 mixed1,
34 r"(.+)(?-u)(.+)",
35 R(b"\xCE\x93\xCE\x94\xFF"),
36 Some((0, 5)),
37 Some((0, 4)),
38 Some((4, 5))
39);
40
41mat!(case_ascii_one, r"(?i-u)a", "A", Some((0, 1)));
42mat!(case_ascii_class, r"(?i-u)[a-z]+", "AaAaA", Some((0, 5)));
43#[cfg(feature = "unicode-case")]
44mat!(case_unicode, r"(?i)[a-z]+", "aA\u{212A}aA", Some((0, 7)));
45mat!(case_not_unicode, r"(?i-u)[a-z]+", "aA\u{212A}aA", Some((0, 2)));
46
47mat!(negate_unicode, r"[^a]", "δ", Some((0, 2)));
48mat!(negate_not_unicode, r"(?-u)[^a]", "δ", Some((0, 1)));
49
50// This doesn't match in a normal Unicode regex because the implicit preceding
51// `.*?` is Unicode aware.
52mat!(dotstar_prefix_not_unicode1, r"(?-u)a", R(b"\xFFa"), Some((1, 2)));
53mat!(dotstar_prefix_not_unicode2, r"a", R(b"\xFFa"), Some((1, 2)));
54
55// Have fun with null bytes.
56mat!(
57 null_bytes,
58 r"(?-u)(?P<cstr>[^\x00]+)\x00",
59 R(b"foo\x00"),
60 Some((0, 4)),
61 Some((0, 3))
62);
63
64// Test that lookahead operators work properly in the face of invalid UTF-8.
65// See: https://github.com/rust-lang/regex/issues/277
66matiter!(
67 invalidutf8_anchor1,
68 r"(?-u)\xcc?^",
69 R(b"\x8d#;\x1a\xa4s3\x05foobarX\\\x0f0t\xe4\x9b\xa4"),
70 (0, 0)
71);
72matiter!(
73 invalidutf8_anchor2,
74 r"(?-u)^\xf7|4\xff\d\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a##########[] d\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a##########\[] #####\x80\S7|$",
75 R(b"\x8d#;\x1a\xa4s3\x05foobarX\\\x0f0t\xe4\x9b\xa4"),
76 (22, 22)
77);
78matiter!(
79 invalidutf8_anchor3,
80 r"(?-u)^|ddp\xff\xffdddddlQd@\x80",
81 R(b"\x8d#;\x1a\xa4s3\x05foobarX\\\x0f0t\xe4\x9b\xa4"),
82 (0, 0)
83);
84
// See https://github.com/rust-lang/regex/issues/303
//
// A non-Unicode class that negates the entire byte range `\x00-\xff` has
// nothing left to match; this test expects compiling it to return an error.
#[test]
fn negated_full_byte_range() {
    assert!(::regex::bytes::Regex::new(r#"(?-u)[^\x00-\xff]"#).is_err());
}
90
91matiter!(word_boundary_ascii1, r"(?-u:\B)x(?-u:\B)", "áxβ");
92matiter!(
93 word_boundary_ascii2,
94 r"(?-u:\B)",
95 "0\u{7EF5E}",
96 (2, 2),
97 (3, 3),
98 (4, 4),
99 (5, 5)
100);
101
102// See: https://github.com/rust-lang/regex/issues/264
103mat!(ascii_boundary_no_capture, r"(?-u)\B", "\u{28f3e}", Some((0, 0)));
104mat!(ascii_boundary_capture, r"(?-u)(\B)", "\u{28f3e}", Some((0, 0)));
105
106// See: https://github.com/rust-lang/regex/issues/271
107mat!(end_not_wb, r"$(?-u:\B)", "\u{5c124}\u{b576c}", Some((8, 8)));