Blame - src/utf8.rs - platform/external/rust/crates/regex

blob: 6e0608fdbaf2e95720dd8406cdbd59d902d8f58e [file] [log] [blame]

Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame^]	1	/// A few elementary UTF-8 encoding and decoding functions used by the matching
				2	/// engines.
				3	///
				4	/// In an ideal world, the matching engines operate on `&str` and we can just
				5	/// lean on the standard library for all our UTF-8 needs. However, to support
				6	/// byte based regexes (that can match on arbitrary bytes which may contain
				7	/// UTF-8), we need to be capable of searching and decoding UTF-8 on a `&[u8]`.
				8	/// The standard library doesn't really recognize this use case, so we have
				9	/// to build it out ourselves.
				10	///
				11	/// Should this be factored out into a separate crate? It seems independently
				12	/// useful. There are other crates that already exist (e.g., `utf-8`) that have
				13	/// overlapping use cases. Not sure what to do.
				14	use std::char;
				15
				16	const TAG_CONT: u8 = 0b1000_0000;
				17	const TAG_TWO: u8 = 0b1100_0000;
				18	const TAG_THREE: u8 = 0b1110_0000;
				19	const TAG_FOUR: u8 = 0b1111_0000;
				20
				21	/// Returns the smallest possible index of the next valid UTF-8 sequence
				22	/// starting after `i`.
				23	pub fn next_utf8(text: &[u8], i: usize) -> usize {
				24	let b = match text.get(i) {
				25	None => return i + 1,
				26	Some(&b) => b,
				27	};
				28	let inc = if b <= 0x7F {
				29	1
				30	} else if b <= 0b110_11111 {
				31	2
				32	} else if b <= 0b1110_1111 {
				33	3
				34	} else {
				35	4
				36	};
				37	i + inc
				38	}
				39
				40	/// Decode a single UTF-8 sequence into a single Unicode codepoint from `src`.
				41	///
				42	/// If no valid UTF-8 sequence could be found, then `None` is returned.
				43	/// Otherwise, the decoded codepoint and the number of bytes read is returned.
				44	/// The number of bytes read (for a valid UTF-8 sequence) is guaranteed to be
				45	/// 1, 2, 3 or 4.
				46	///
				47	/// Note that a UTF-8 sequence is invalid if it is incorrect UTF-8, encodes a
				48	/// codepoint that is out of range (surrogate codepoints are out of range) or
				49	/// is not the shortest possible UTF-8 sequence for that codepoint.
				50	#[inline]
				51	pub fn decode_utf8(src: &[u8]) -> Option<(char, usize)> {
				52	let b0 = match src.get(0) {
				53	None => return None,
				54	Some(&b) if b <= 0x7F => return Some((b as char, 1)),
				55	Some(&b) => b,
				56	};
				57	match b0 {
				58	0b110_00000..=0b110_11111 => {
				59	if src.len() < 2 {
				60	return None;
				61	}
				62	let b1 = src[1];
				63	if 0b11_000000 & b1 != TAG_CONT {
				64	return None;
				65	}
				66	let cp = ((b0 & !TAG_TWO) as u32) << 6 \| ((b1 & !TAG_CONT) as u32);
				67	match cp {
				68	0x80..=0x7FF => char::from_u32(cp).map(\|cp\| (cp, 2)),
				69	_ => None,
				70	}
				71	}
				72	0b1110_0000..=0b1110_1111 => {
				73	if src.len() < 3 {
				74	return None;
				75	}
				76	let (b1, b2) = (src[1], src[2]);
				77	if 0b11_000000 & b1 != TAG_CONT {
				78	return None;
				79	}
				80	if 0b11_000000 & b2 != TAG_CONT {
				81	return None;
				82	}
				83	let cp = ((b0 & !TAG_THREE) as u32) << 12
				84	\| ((b1 & !TAG_CONT) as u32) << 6
				85	\| ((b2 & !TAG_CONT) as u32);
				86	match cp {
				87	// char::from_u32 will disallow surrogate codepoints.
				88	0x800..=0xFFFF => char::from_u32(cp).map(\|cp\| (cp, 3)),
				89	_ => None,
				90	}
				91	}
				92	0b11110_000..=0b11110_111 => {
				93	if src.len() < 4 {
				94	return None;
				95	}
				96	let (b1, b2, b3) = (src[1], src[2], src[3]);
				97	if 0b11_000000 & b1 != TAG_CONT {
				98	return None;
				99	}
				100	if 0b11_000000 & b2 != TAG_CONT {
				101	return None;
				102	}
				103	if 0b11_000000 & b3 != TAG_CONT {
				104	return None;
				105	}
				106	let cp = ((b0 & !TAG_FOUR) as u32) << 18
				107	\| ((b1 & !TAG_CONT) as u32) << 12
				108	\| ((b2 & !TAG_CONT) as u32) << 6
				109	\| ((b3 & !TAG_CONT) as u32);
				110	match cp {
				111	0x10000..=0x10FFFF => char::from_u32(cp).map(\|cp\| (cp, 4)),
				112	_ => None,
				113	}
				114	}
				115	_ => None,
				116	}
				117	}
				118
				119	/// Like `decode_utf8`, but decodes the last UTF-8 sequence in `src` instead
				120	/// of the first.
				121	pub fn decode_last_utf8(src: &[u8]) -> Option<(char, usize)> {
				122	if src.is_empty() {
				123	return None;
				124	}
				125	let mut start = src.len() - 1;
				126	if src[start] <= 0x7F {
				127	return Some((src[start] as char, 1));
				128	}
				129	while start > src.len().saturating_sub(4) {
				130	start -= 1;
				131	if is_start_byte(src[start]) {
				132	break;
				133	}
				134	}
				135	match decode_utf8(&src[start..]) {
				136	None => None,
				137	Some((_, n)) if n < src.len() - start => None,
				138	Some((cp, n)) => Some((cp, n)),
				139	}
				140	}
				141
				142	fn is_start_byte(b: u8) -> bool {
				143	b & 0b11_000000 != 0b1_0000000
				144	}
				145
				146	#[cfg(test)]
				147	mod tests {
				148	use std::str;
				149
				150	use quickcheck::quickcheck;
				151
				152	use super::{
				153	decode_last_utf8, decode_utf8, TAG_CONT, TAG_FOUR, TAG_THREE, TAG_TWO,
				154	};
				155
				156	#[test]
				157	fn prop_roundtrip() {
				158	fn p(given_cp: char) -> bool {
				159	let mut tmp = [0; 4];
				160	let encoded_len = given_cp.encode_utf8(&mut tmp).len();
				161	let (got_cp, got_len) = decode_utf8(&tmp[..encoded_len]).unwrap();
				162	encoded_len == got_len && given_cp == got_cp
				163	}
				164	quickcheck(p as fn(char) -> bool)
				165	}
				166
				167	#[test]
				168	fn prop_roundtrip_last() {
				169	fn p(given_cp: char) -> bool {
				170	let mut tmp = [0; 4];
				171	let encoded_len = given_cp.encode_utf8(&mut tmp).len();
				172	let (got_cp, got_len) =
				173	decode_last_utf8(&tmp[..encoded_len]).unwrap();
				174	encoded_len == got_len && given_cp == got_cp
				175	}
				176	quickcheck(p as fn(char) -> bool)
				177	}
				178
				179	#[test]
				180	fn prop_encode_matches_std() {
				181	fn p(cp: char) -> bool {
				182	let mut got = [0; 4];
				183	let n = cp.encode_utf8(&mut got).len();
				184	let expected = cp.to_string();
				185	&got[..n] == expected.as_bytes()
				186	}
				187	quickcheck(p as fn(char) -> bool)
				188	}
				189
				190	#[test]
				191	fn prop_decode_matches_std() {
				192	fn p(given_cp: char) -> bool {
				193	let mut tmp = [0; 4];
				194	let n = given_cp.encode_utf8(&mut tmp).len();
				195	let (got_cp, _) = decode_utf8(&tmp[..n]).unwrap();
				196	let expected_cp =
				197	str::from_utf8(&tmp[..n]).unwrap().chars().next().unwrap();
				198	got_cp == expected_cp
				199	}
				200	quickcheck(p as fn(char) -> bool)
				201	}
				202
				203	#[test]
				204	fn prop_decode_last_matches_std() {
				205	fn p(given_cp: char) -> bool {
				206	let mut tmp = [0; 4];
				207	let n = given_cp.encode_utf8(&mut tmp).len();
				208	let (got_cp, _) = decode_last_utf8(&tmp[..n]).unwrap();
				209	let expected_cp = str::from_utf8(&tmp[..n])
				210	.unwrap()
				211	.chars()
				212	.rev()
				213	.next()
				214	.unwrap();
				215	got_cp == expected_cp
				216	}
				217	quickcheck(p as fn(char) -> bool)
				218	}
				219
				220	#[test]
				221	fn reject_invalid() {
				222	// Invalid start byte
				223	assert_eq!(decode_utf8(&[0xFF]), None);
				224	// Surrogate pair
				225	assert_eq!(decode_utf8(&[0xED, 0xA0, 0x81]), None);
				226	// Invalid continuation byte.
				227	assert_eq!(decode_utf8(&[0xD4, 0xC2]), None);
				228	// Bad lengths
				229	assert_eq!(decode_utf8(&[0xC3]), None); // 2 bytes
				230	assert_eq!(decode_utf8(&[0xEF, 0xBF]), None); // 3 bytes
				231	assert_eq!(decode_utf8(&[0xF4, 0x8F, 0xBF]), None); // 4 bytes
				232	// Not a minimal UTF-8 sequence
				233	assert_eq!(decode_utf8(&[TAG_TWO, TAG_CONT \| b'a']), None);
				234	assert_eq!(decode_utf8(&[TAG_THREE, TAG_CONT, TAG_CONT \| b'a']), None);
				235	assert_eq!(
				236	decode_utf8(&[TAG_FOUR, TAG_CONT, TAG_CONT, TAG_CONT \| b'a',]),
				237	None
				238	);
				239	}
				240
				241	#[test]
				242	fn reject_invalid_last() {
				243	// Invalid start byte
				244	assert_eq!(decode_last_utf8(&[0xFF]), None);
				245	// Surrogate pair
				246	assert_eq!(decode_last_utf8(&[0xED, 0xA0, 0x81]), None);
				247	// Bad lengths
				248	assert_eq!(decode_last_utf8(&[0xC3]), None); // 2 bytes
				249	assert_eq!(decode_last_utf8(&[0xEF, 0xBF]), None); // 3 bytes
				250	assert_eq!(decode_last_utf8(&[0xF4, 0x8F, 0xBF]), None); // 4 bytes
				251	// Not a minimal UTF-8 sequence
				252	assert_eq!(decode_last_utf8(&[TAG_TWO, TAG_CONT \| b'a']), None);
				253	assert_eq!(
				254	decode_last_utf8(&[TAG_THREE, TAG_CONT, TAG_CONT \| b'a',]),
				255	None
				256	);
				257	assert_eq!(
				258	decode_last_utf8(
				259	&[TAG_FOUR, TAG_CONT, TAG_CONT, TAG_CONT \| b'a',]
				260	),
				261	None
				262	);
				263	}
				264	}