Blame - src/utf8.rs - platform/external/rust/crates/regex-syntax

blob: 489729e745eb9d892b9d82cfd1946d3a81adc314 [file] [log] [blame]

Chih-Hung Hsieh	048fc04	2020-04-16 10:44:22 -0700	[diff] [blame]	1	/*!
				2	Converts ranges of Unicode scalar values to equivalent ranges of UTF-8 bytes.
				3
				4	This is sub-module is useful for constructing byte based automatons that need
				5	to embed UTF-8 decoding. The most common use of this module is in conjunction
				6	with the [`hir::ClassUnicodeRange`](../hir/struct.ClassUnicodeRange.html) type.
				7
				8	See the documentation on the `Utf8Sequences` iterator for more details and
				9	an example.
				10
				11	# Wait, what is this?
				12
				13	This is simplest to explain with an example. Let's say you wanted to test
				14	whether a particular byte sequence was a Cyrillic character. One possible
				15	scalar value range is `[0400-04FF]`. The set of allowed bytes for this
				16	range can be expressed as a sequence of byte ranges:
				17
				18	```ignore
				19	[D0-D3][80-BF]
				20	```
				21
				22	This is simple enough: simply encode the boundaries, `0400` encodes to
				23	`D0 80` and `04FF` encodes to `D3 BF`, and create ranges from each
				24	corresponding pair of bytes: `D0` to `D3` and `80` to `BF`.
				25
				26	However, what if you wanted to add the Cyrillic Supplementary characters to
				27	your range? Your range might then become `[0400-052F]`. The same procedure
				28	as above doesn't quite work because `052F` encodes to `D4 AF`. The byte ranges
				29	you'd get from the previous transformation would be `[D0-D4][80-AF]`. However,
				30	this isn't quite correct because this range doesn't capture many characters,
				31	for example, `04FF` (because its last byte, `BF` isn't in the range `80-AF`).
				32
				33	Instead, you need multiple sequences of byte ranges:
				34
				35	```ignore
				36	[D0-D3][80-BF] # matches codepoints 0400-04FF
				37	[D4][80-AF] # matches codepoints 0500-052F
				38	```
				39
				40	This gets even more complicated if you want bigger ranges, particularly if
				41	they naively contain surrogate codepoints. For example, the sequence of byte
				42	ranges for the basic multilingual plane (`[0000-FFFF]`) look like this:
				43
				44	```ignore
				45	[0-7F]
				46	[C2-DF][80-BF]
				47	[E0][A0-BF][80-BF]
				48	[E1-EC][80-BF][80-BF]
				49	[ED][80-9F][80-BF]
				50	[EE-EF][80-BF][80-BF]
				51	```
				52
				53	Note that the byte ranges above will not match any erroneous encoding of
				54	UTF-8, including encodings of surrogate codepoints.
				55
				56	And, of course, for all of Unicode (`[000000-10FFFF]`):
				57
				58	```ignore
				59	[0-7F]
				60	[C2-DF][80-BF]
				61	[E0][A0-BF][80-BF]
				62	[E1-EC][80-BF][80-BF]
				63	[ED][80-9F][80-BF]
				64	[EE-EF][80-BF][80-BF]
				65	[F0][90-BF][80-BF][80-BF]
				66	[F1-F3][80-BF][80-BF][80-BF]
				67	[F4][80-8F][80-BF][80-BF]
				68	```
				69
				70	This module automates the process of creating these byte ranges from ranges of
				71	Unicode scalar values.
				72
				73	# Lineage
				74
				75	I got the idea and general implementation strategy from Russ Cox in his
				76	[article on regexps](https://web.archive.org/web/20160404141123/https://swtch.com/~rsc/regexp/regexp3.html) and RE2.
				77	Russ Cox got it from Ken Thompson's `grep` (no source, folk lore?).
				78	I also got the idea from
				79	[Lucene](https://github.com/apache/lucene-solr/blob/ae93f4e7ac6a3908046391de35d4f50a0d3c59ca/lucene/core/src/java/org/apache/lucene/util/automaton/UTF32ToUTF8.java),
				80	which uses it for executing automata on their term index.
				81	*/
				82
				83	#![deny(missing_docs)]
				84
				85	use std::char;
				86	use std::fmt;
				87	use std::slice;
				88
				89	const MAX_UTF8_BYTES: usize = 4;
				90
				91	/// Utf8Sequence represents a sequence of byte ranges.
				92	///
				93	/// To match a Utf8Sequence, a candidate byte sequence must match each
				94	/// successive range.
				95	///
				96	/// For example, if there are two ranges, `[C2-DF][80-BF]`, then the byte
				97	/// sequence `\xDD\x61` would not match because `0x61 < 0x80`.
				98	#[derive(Copy, Clone, Eq, PartialEq, PartialOrd, Ord)]
				99	pub enum Utf8Sequence {
				100	/// One byte range.
				101	One(Utf8Range),
				102	/// Two successive byte ranges.
				103	Two([Utf8Range; 2]),
				104	/// Three successive byte ranges.
				105	Three([Utf8Range; 3]),
				106	/// Four successive byte ranges.
				107	Four([Utf8Range; 4]),
				108	}
				109
				110	impl Utf8Sequence {
				111	/// Creates a new UTF-8 sequence from the encoded bytes of a scalar value
				112	/// range.
				113	///
				114	/// This assumes that `start` and `end` have the same length.
				115	fn from_encoded_range(start: &[u8], end: &[u8]) -> Self {
				116	assert_eq!(start.len(), end.len());
				117	match start.len() {
				118	2 => Utf8Sequence::Two([
				119	Utf8Range::new(start[0], end[0]),
				120	Utf8Range::new(start[1], end[1]),
				121	]),
				122	3 => Utf8Sequence::Three([
				123	Utf8Range::new(start[0], end[0]),
				124	Utf8Range::new(start[1], end[1]),
				125	Utf8Range::new(start[2], end[2]),
				126	]),
				127	4 => Utf8Sequence::Four([
				128	Utf8Range::new(start[0], end[0]),
				129	Utf8Range::new(start[1], end[1]),
				130	Utf8Range::new(start[2], end[2]),
				131	Utf8Range::new(start[3], end[3]),
				132	]),
				133	n => unreachable!("invalid encoded length: {}", n),
				134	}
				135	}
				136
				137	/// Returns the underlying sequence of byte ranges as a slice.
				138	pub fn as_slice(&self) -> &[Utf8Range] {
				139	use self::Utf8Sequence::*;
				140	match *self {
				141	One(ref r) => slice::from_ref(r),
				142	Two(ref r) => &r[..],
				143	Three(ref r) => &r[..],
				144	Four(ref r) => &r[..],
				145	}
				146	}
				147
				148	/// Returns the number of byte ranges in this sequence.
				149	///
				150	/// The length is guaranteed to be in the closed interval `[1, 4]`.
				151	pub fn len(&self) -> usize {
				152	self.as_slice().len()
				153	}
				154
				155	/// Reverses the ranges in this sequence.
				156	///
				157	/// For example, if this corresponds to the following sequence:
				158	///
				159	/// ```ignore
				160	/// [D0-D3][80-BF]
				161	/// ```
				162	///
				163	/// Then after reversal, it will be
				164	///
				165	/// ```ignore
				166	/// [80-BF][D0-D3]
				167	/// ```
				168	///
				169	/// This is useful when one is constructing a UTF-8 automaton to match
				170	/// character classes in reverse.
				171	pub fn reverse(&mut self) {
				172	match *self {
				173	Utf8Sequence::One(_) => {}
				174	Utf8Sequence::Two(ref mut x) => x.reverse(),
				175	Utf8Sequence::Three(ref mut x) => x.reverse(),
				176	Utf8Sequence::Four(ref mut x) => x.reverse(),
				177	}
				178	}
				179
				180	/// Returns true if and only if a prefix of `bytes` matches this sequence
				181	/// of byte ranges.
				182	pub fn matches(&self, bytes: &[u8]) -> bool {
				183	if bytes.len() < self.len() {
				184	return false;
				185	}
				186	for (&b, r) in bytes.iter().zip(self) {
				187	if !r.matches(b) {
				188	return false;
				189	}
				190	}
				191	true
				192	}
				193	}
				194
				195	impl<'a> IntoIterator for &'a Utf8Sequence {
				196	type IntoIter = slice::Iter<'a, Utf8Range>;
				197	type Item = &'a Utf8Range;
				198
				199	fn into_iter(self) -> Self::IntoIter {
				200	self.as_slice().into_iter()
				201	}
				202	}
				203
				204	impl fmt::Debug for Utf8Sequence {
				205	fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
				206	use self::Utf8Sequence::*;
				207	match *self {
				208	One(ref r) => write!(f, "{:?}", r),
				209	Two(ref r) => write!(f, "{:?}{:?}", r[0], r[1]),
				210	Three(ref r) => write!(f, "{:?}{:?}{:?}", r[0], r[1], r[2]),
				211	Four(ref r) => {
				212	write!(f, "{:?}{:?}{:?}{:?}", r[0], r[1], r[2], r[3])
				213	}
				214	}
				215	}
				216	}
				217
				218	/// A single inclusive range of UTF-8 bytes.
				219	#[derive(Clone, Copy, Eq, PartialEq, PartialOrd, Ord)]
				220	pub struct Utf8Range {
				221	/// Start of byte range (inclusive).
				222	pub start: u8,
				223	/// End of byte range (inclusive).
				224	pub end: u8,
				225	}
				226
				227	impl Utf8Range {
				228	fn new(start: u8, end: u8) -> Self {
				229	Utf8Range { start, end }
				230	}
				231
				232	/// Returns true if and only if the given byte is in this range.
				233	pub fn matches(&self, b: u8) -> bool {
				234	self.start <= b && b <= self.end
				235	}
				236	}
				237
				238	impl fmt::Debug for Utf8Range {
				239	fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
				240	if self.start == self.end {
				241	write!(f, "[{:X}]", self.start)
				242	} else {
				243	write!(f, "[{:X}-{:X}]", self.start, self.end)
				244	}
				245	}
				246	}
				247
				248	/// An iterator over ranges of matching UTF-8 byte sequences.
				249	///
				250	/// The iteration represents an alternation of comprehensive byte sequences
				251	/// that match precisely the set of UTF-8 encoded scalar values.
				252	///
				253	/// A byte sequence corresponds to one of the scalar values in the range given
				254	/// if and only if it completely matches exactly one of the sequences of byte
				255	/// ranges produced by this iterator.
				256	///
				257	/// Each sequence of byte ranges matches a unique set of bytes. That is, no two
				258	/// sequences will match the same bytes.
				259	///
				260	/// # Example
				261	///
				262	/// This shows how to match an arbitrary byte sequence against a range of
				263	/// scalar values.
				264	///
				265	/// ```rust
				266	/// use regex_syntax::utf8::{Utf8Sequences, Utf8Sequence};
				267	///
				268	/// fn matches(seqs: &[Utf8Sequence], bytes: &[u8]) -> bool {
				269	/// for range in seqs {
				270	/// if range.matches(bytes) {
				271	/// return true;
				272	/// }
				273	/// }
				274	/// false
				275	/// }
				276	///
				277	/// // Test the basic multilingual plane.
				278	/// let seqs: Vec<_> = Utf8Sequences::new('\u{0}', '\u{FFFF}').collect();
				279	///
				280	/// // UTF-8 encoding of 'a'.
				281	/// assert!(matches(&seqs, &[0x61]));
				282	/// // UTF-8 encoding of '☃' (`\u{2603}`).
				283	/// assert!(matches(&seqs, &[0xE2, 0x98, 0x83]));
				284	/// // UTF-8 encoding of `\u{10348}` (outside the BMP).
				285	/// assert!(!matches(&seqs, &[0xF0, 0x90, 0x8D, 0x88]));
				286	/// // Tries to match against a UTF-8 encoding of a surrogate codepoint,
				287	/// // which is invalid UTF-8, and therefore fails, despite the fact that
				288	/// // the corresponding codepoint (0xD800) falls in the range given.
				289	/// assert!(!matches(&seqs, &[0xED, 0xA0, 0x80]));
				290	/// // And fails against plain old invalid UTF-8.
				291	/// assert!(!matches(&seqs, &[0xFF, 0xFF]));
				292	/// ```
				293	///
				294	/// If this example seems circuitous, that's because it is! It's meant to be
				295	/// illustrative. In practice, you could just try to decode your byte sequence
				296	/// and compare it with the scalar value range directly. However, this is not
				297	/// always possible (for example, in a byte based automaton).
				298	pub struct Utf8Sequences {
				299	range_stack: Vec<ScalarRange>,
				300	}
				301
				302	impl Utf8Sequences {
				303	/// Create a new iterator over UTF-8 byte ranges for the scalar value range
				304	/// given.
				305	pub fn new(start: char, end: char) -> Self {
				306	let mut it = Utf8Sequences { range_stack: vec![] };
				307	it.push(start as u32, end as u32);
				308	it
				309	}
				310
				311	/// reset resets the scalar value range.
				312	/// Any existing state is cleared, but resources may be reused.
				313	///
				314	/// N.B. Benchmarks say that this method is dubious.
				315	#[doc(hidden)]
				316	pub fn reset(&mut self, start: char, end: char) {
				317	self.range_stack.clear();
				318	self.push(start as u32, end as u32);
				319	}
				320
				321	fn push(&mut self, start: u32, end: u32) {
				322	self.range_stack.push(ScalarRange { start, end });
				323	}
				324	}
				325
				326	struct ScalarRange {
				327	start: u32,
				328	end: u32,
				329	}
				330
				331	impl fmt::Debug for ScalarRange {
				332	fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
				333	write!(f, "ScalarRange({:X}, {:X})", self.start, self.end)
				334	}
				335	}
				336
				337	impl Iterator for Utf8Sequences {
				338	type Item = Utf8Sequence;
				339
				340	fn next(&mut self) -> Option<Self::Item> {
				341	'TOP: while let Some(mut r) = self.range_stack.pop() {
				342	'INNER: loop {
				343	if let Some((r1, r2)) = r.split() {
				344	self.push(r2.start, r2.end);
				345	r.start = r1.start;
				346	r.end = r1.end;
				347	continue 'INNER;
				348	}
				349	if !r.is_valid() {
				350	continue 'TOP;
				351	}
				352	for i in 1..MAX_UTF8_BYTES {
				353	let max = max_scalar_value(i);
				354	if r.start <= max && max < r.end {
				355	self.push(max + 1, r.end);
				356	r.end = max;
				357	continue 'INNER;
				358	}
				359	}
				360	if let Some(ascii_range) = r.as_ascii() {
				361	return Some(Utf8Sequence::One(ascii_range));
				362	}
				363	for i in 1..MAX_UTF8_BYTES {
				364	let m = (1 << (6 * i)) - 1;
				365	if (r.start & !m) != (r.end & !m) {
				366	if (r.start & m) != 0 {
				367	self.push((r.start \| m) + 1, r.end);
				368	r.end = r.start \| m;
				369	continue 'INNER;
				370	}
				371	if (r.end & m) != m {
				372	self.push(r.end & !m, r.end);
				373	r.end = (r.end & !m) - 1;
				374	continue 'INNER;
				375	}
				376	}
				377	}
				378	let mut start = [0; MAX_UTF8_BYTES];
				379	let mut end = [0; MAX_UTF8_BYTES];
				380	let n = r.encode(&mut start, &mut end);
				381	return Some(Utf8Sequence::from_encoded_range(
				382	&start[0..n],
				383	&end[0..n],
				384	));
				385	}
				386	}
				387	None
				388	}
				389	}
				390
				391	impl ScalarRange {
				392	/// split splits this range if it overlaps with a surrogate codepoint.
				393	///
				394	/// Either or both ranges may be invalid.
				395	fn split(&self) -> Option<(ScalarRange, ScalarRange)> {
				396	if self.start < 0xE000 && self.end > 0xD7FF {
				397	Some((
				398	ScalarRange { start: self.start, end: 0xD7FF },
				399	ScalarRange { start: 0xE000, end: self.end },
				400	))
				401	} else {
				402	None
				403	}
				404	}
				405
				406	/// is_valid returns true if and only if start <= end.
				407	fn is_valid(&self) -> bool {
				408	self.start <= self.end
				409	}
				410
				411	/// as_ascii returns this range as a Utf8Range if and only if all scalar
				412	/// values in this range can be encoded as a single byte.
				413	fn as_ascii(&self) -> Option<Utf8Range> {
				414	if self.is_ascii() {
				415	Some(Utf8Range::new(self.start as u8, self.end as u8))
				416	} else {
				417	None
				418	}
				419	}
				420
				421	/// is_ascii returns true if the range is ASCII only (i.e., takes a single
				422	/// byte to encode any scalar value).
				423	fn is_ascii(&self) -> bool {
				424	self.is_valid() && self.end <= 0x7f
				425	}
				426
				427	/// encode writes the UTF-8 encoding of the start and end of this range
				428	/// to the corresponding destination slices, and returns the number of
				429	/// bytes written.
				430	///
				431	/// The slices should have room for at least `MAX_UTF8_BYTES`.
				432	fn encode(&self, start: &mut [u8], end: &mut [u8]) -> usize {
				433	let cs = char::from_u32(self.start).unwrap();
				434	let ce = char::from_u32(self.end).unwrap();
				435	let ss = cs.encode_utf8(start);
				436	let se = ce.encode_utf8(end);
				437	assert_eq!(ss.len(), se.len());
				438	ss.len()
				439	}
				440	}
				441
				442	fn max_scalar_value(nbytes: usize) -> u32 {
				443	match nbytes {
				444	1 => 0x007F,
				445	2 => 0x07FF,
				446	3 => 0xFFFF,
				447	4 => 0x10FFFF,
				448	_ => unreachable!("invalid UTF-8 byte sequence size"),
				449	}
				450	}
				451
				452	#[cfg(test)]
				453	mod tests {
				454	use std::char;
				455
				456	use utf8::{Utf8Range, Utf8Sequences};
				457
				458	fn rutf8(s: u8, e: u8) -> Utf8Range {
				459	Utf8Range::new(s, e)
				460	}
				461
				462	fn never_accepts_surrogate_codepoints(start: char, end: char) {
				463	for cp in 0xD800..0xE000 {
				464	let buf = encode_surrogate(cp);
				465	for r in Utf8Sequences::new(start, end) {
				466	if r.matches(&buf) {
				467	panic!(
				468	"Sequence ({:X}, {:X}) contains range {:?}, \
				469	which matches surrogate code point {:X} \
				470	with encoded bytes {:?}",
				471	start as u32, end as u32, r, cp, buf,
				472	);
				473	}
				474	}
				475	}
				476	}
				477
				478	#[test]
				479	fn codepoints_no_surrogates() {
				480	never_accepts_surrogate_codepoints('\u{0}', '\u{FFFF}');
				481	never_accepts_surrogate_codepoints('\u{0}', '\u{10FFFF}');
				482	never_accepts_surrogate_codepoints('\u{0}', '\u{10FFFE}');
				483	never_accepts_surrogate_codepoints('\u{80}', '\u{10FFFF}');
				484	never_accepts_surrogate_codepoints('\u{D7FF}', '\u{E000}');
				485	}
				486
				487	#[test]
				488	fn single_codepoint_one_sequence() {
				489	// Tests that every range of scalar values that contains a single
				490	// scalar value is recognized by one sequence of byte ranges.
				491	for i in 0x0..(0x10FFFF + 1) {
				492	let c = match char::from_u32(i) {
				493	None => continue,
				494	Some(c) => c,
				495	};
				496	let seqs: Vec<_> = Utf8Sequences::new(c, c).collect();
				497	assert_eq!(seqs.len(), 1);
				498	}
				499	}
				500
				501	#[test]
				502	fn bmp() {
				503	use utf8::Utf8Sequence::*;
				504
				505	let seqs = Utf8Sequences::new('\u{0}', '\u{FFFF}').collect::<Vec<_>>();
				506	assert_eq!(
				507	seqs,
				508	vec![
				509	One(rutf8(0x0, 0x7F)),
				510	Two([rutf8(0xC2, 0xDF), rutf8(0x80, 0xBF)]),
				511	Three([
				512	rutf8(0xE0, 0xE0),
				513	rutf8(0xA0, 0xBF),
				514	rutf8(0x80, 0xBF)
				515	]),
				516	Three([
				517	rutf8(0xE1, 0xEC),
				518	rutf8(0x80, 0xBF),
				519	rutf8(0x80, 0xBF)
				520	]),
				521	Three([
				522	rutf8(0xED, 0xED),
				523	rutf8(0x80, 0x9F),
				524	rutf8(0x80, 0xBF)
				525	]),
				526	Three([
				527	rutf8(0xEE, 0xEF),
				528	rutf8(0x80, 0xBF),
				529	rutf8(0x80, 0xBF)
				530	]),
				531	]
				532	);
				533	}
				534
				535	#[test]
				536	fn reverse() {
				537	use utf8::Utf8Sequence::*;
				538
				539	let mut s = One(rutf8(0xA, 0xB));
				540	s.reverse();
				541	assert_eq!(s.as_slice(), &[rutf8(0xA, 0xB)]);
				542
				543	let mut s = Two([rutf8(0xA, 0xB), rutf8(0xB, 0xC)]);
				544	s.reverse();
				545	assert_eq!(s.as_slice(), &[rutf8(0xB, 0xC), rutf8(0xA, 0xB)]);
				546
				547	let mut s = Three([rutf8(0xA, 0xB), rutf8(0xB, 0xC), rutf8(0xC, 0xD)]);
				548	s.reverse();
				549	assert_eq!(
				550	s.as_slice(),
				551	&[rutf8(0xC, 0xD), rutf8(0xB, 0xC), rutf8(0xA, 0xB)]
				552	);
				553
				554	let mut s = Four([
				555	rutf8(0xA, 0xB),
				556	rutf8(0xB, 0xC),
				557	rutf8(0xC, 0xD),
				558	rutf8(0xD, 0xE),
				559	]);
				560	s.reverse();
				561	assert_eq!(
				562	s.as_slice(),
				563	&[
				564	rutf8(0xD, 0xE),
				565	rutf8(0xC, 0xD),
				566	rutf8(0xB, 0xC),
				567	rutf8(0xA, 0xB)
				568	]
				569	);
				570	}
				571
				572	fn encode_surrogate(cp: u32) -> [u8; 3] {
				573	const TAG_CONT: u8 = 0b1000_0000;
				574	const TAG_THREE_B: u8 = 0b1110_0000;
				575
				576	assert!(0xD800 <= cp && cp < 0xE000);
				577	let mut dst = [0; 3];
				578	dst[0] = (cp >> 12 & 0x0F) as u8 \| TAG_THREE_B;
				579	dst[1] = (cp >> 6 & 0x3F) as u8 \| TAG_CONT;
				580	dst[2] = (cp & 0x3F) as u8 \| TAG_CONT;
				581	dst
				582	}
				583	}