Blame - tests/bytes.rs - platform/external/rust/crates/regex

blob: d05f138edfa5e3ada262e070af3879bbc95118be [file] [log] [blame]

Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	1	// These are tests specifically crafted for regexes that can match arbitrary
				2	// bytes.
				3
				4	// A silly wrapper to make it possible to write and match raw bytes.
				5	struct R<'a>(&'a [u8]);
				6	impl<'a> R<'a> {
				7	fn as_bytes(&self) -> &'a [u8] {
				8	self.0
				9	}
				10	}
				11
				12	mat!(word_boundary, r"(?-u) \b", " δ", None);
				13	#[cfg(feature = "unicode-perl")]
				14	mat!(word_boundary_unicode, r" \b", " δ", Some((0, 1)));
				15	mat!(word_not_boundary, r"(?-u) \B", " δ", Some((0, 1)));
				16	#[cfg(feature = "unicode-perl")]
				17	mat!(word_not_boundary_unicode, r" \B", " δ", None);
				18
				19	mat!(perl_w_ascii, r"(?-u)\w+", "aδ", Some((0, 1)));
				20	#[cfg(feature = "unicode-perl")]
				21	mat!(perl_w_unicode, r"\w+", "aδ", Some((0, 3)));
				22	mat!(perl_d_ascii, r"(?-u)\d+", "1२३9", Some((0, 1)));
				23	#[cfg(feature = "unicode-perl")]
				24	mat!(perl_d_unicode, r"\d+", "1२३9", Some((0, 8)));
				25	mat!(perl_s_ascii, r"(?-u)\s+", " \u{1680}", Some((0, 1)));
				26	#[cfg(feature = "unicode-perl")]
				27	mat!(perl_s_unicode, r"\s+", " \u{1680}", Some((0, 4)));
				28
				29	// The first `(.+)` matches two Unicode codepoints, but can't match the 5th
				30	// byte, which isn't valid UTF-8. The second (byte based) `(.+)` takes over and
				31	// matches.
				32	mat!(
				33	mixed1,
				34	r"(.+)(?-u)(.+)",
				35	R(b"\xCE\x93\xCE\x94\xFF"),
				36	Some((0, 5)),
				37	Some((0, 4)),
				38	Some((4, 5))
				39	);
				40
				41	mat!(case_ascii_one, r"(?i-u)a", "A", Some((0, 1)));
				42	mat!(case_ascii_class, r"(?i-u)[a-z]+", "AaAaA", Some((0, 5)));
				43	#[cfg(feature = "unicode-case")]
				44	mat!(case_unicode, r"(?i)[a-z]+", "aA\u{212A}aA", Some((0, 7)));
				45	mat!(case_not_unicode, r"(?i-u)[a-z]+", "aA\u{212A}aA", Some((0, 2)));
				46
				47	mat!(negate_unicode, r"[^a]", "δ", Some((0, 2)));
				48	mat!(negate_not_unicode, r"(?-u)[^a]", "δ", Some((0, 1)));
				49
				50	// This doesn't match in a normal Unicode regex because the implicit preceding
				51	// `.*?` is Unicode aware.
				52	mat!(dotstar_prefix_not_unicode1, r"(?-u)a", R(b"\xFFa"), Some((1, 2)));
				53	mat!(dotstar_prefix_not_unicode2, r"a", R(b"\xFFa"), Some((1, 2)));
				54
				55	// Have fun with null bytes.
				56	mat!(
				57	null_bytes,
				58	r"(?-u)(?P<cstr>[^\x00]+)\x00",
				59	R(b"foo\x00"),
				60	Some((0, 4)),
				61	Some((0, 3))
				62	);
				63
				64	// Test that lookahead operators work properly in the face of invalid UTF-8.
				65	// See: https://github.com/rust-lang/regex/issues/277
				66	matiter!(
				67	invalidutf8_anchor1,
				68	r"(?-u)\xcc?^",
				69	R(b"\x8d#;\x1a\xa4s3\x05foobarX\\\x0f0t\xe4\x9b\xa4"),
				70	(0, 0)
				71	);
				72	matiter!(
				73	invalidutf8_anchor2,
				74	r"(?-u)^\xf7\|4\xff\d\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a##########[] d\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a##########\[] #####\x80\S7\|$",
				75	R(b"\x8d#;\x1a\xa4s3\x05foobarX\\\x0f0t\xe4\x9b\xa4"),
				76	(22, 22)
				77	);
				78	matiter!(
				79	invalidutf8_anchor3,
				80	r"(?-u)^\|ddp\xff\xffdddddlQd@\x80",
				81	R(b"\x8d#;\x1a\xa4s3\x05foobarX\\\x0f0t\xe4\x9b\xa4"),
				82	(0, 0)
				83	);
				84
				85	// See https://github.com/rust-lang/regex/issues/303
				86	#[test]
				87	fn negated_full_byte_range() {
				88	assert!(::regex::bytes::Regex::new(r#"(?-u)[^\x00-\xff]"#).is_err());
				89	}
				90
				91	matiter!(word_boundary_ascii1, r"(?-u:\B)x(?-u:\B)", "áxβ");
				92	matiter!(
				93	word_boundary_ascii2,
				94	r"(?-u:\B)",
				95	"0\u{7EF5E}",
				96	(2, 2),
				97	(3, 3),
				98	(4, 4),
				99	(5, 5)
				100	);
				101
				102	// See: https://github.com/rust-lang/regex/issues/264
				103	mat!(ascii_boundary_no_capture, r"(?-u)\B", "\u{28f3e}", Some((0, 0)));
				104	mat!(ascii_boundary_capture, r"(?-u)(\B)", "\u{28f3e}", Some((0, 0)));
				105
				106	// See: https://github.com/rust-lang/regex/issues/271
				107	mat!(end_not_wb, r"$(?-u:\B)", "\u{5c124}\u{b576c}", Some((8, 8)));