Blame - src/lib.rs - platform/external/rust/crates/cesu8

blob: 7fac5ffbe9a60c9604e8e3d990cc86b8bfbee249 [file] [log] [blame]

Ivan Lozano	a7e4bc0	2021-08-20 09:59:16 -0400	[diff] [blame]	1	// Copyright 2012-2014 The Rust Project Developers and Eric Kidd. See the
				2	// COPYRIGHT-RUST.txt file at the top-level directory of this distribution.
				3	//
				4	// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
				5	// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
				6	// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
				7	// option. This file may not be copied, modified, or distributed except
				8	// according to those terms.
				9
				10
				11	//! A simple library implementing the [CESU-8 compatibility encoding
				12	//! scheme](http://www.unicode.org/reports/tr26/tr26-2.html). This is a
				13	//! non-standard variant of UTF-8 that is used internally by some systems
				14	//! that need to represent UTF-16 data as 8-bit characters. Yes, this is
				15	//! ugly.
				16	//!
				17	//! Use of this encoding is discouraged by the Unicode Consortium. It's OK
				18	//! for working with existing internal APIs, but it should not be used for
				19	//! transmitting or storing data.
				20	//!
				21	//! ```
				22	//! use std::borrow::Cow;
				23	//! use cesu8::{from_cesu8, to_cesu8};
				24	//!
				25	//! // 16-bit Unicode characters are the same in UTF-8 and CESU-8.
				26	//! assert_eq!(Cow::Borrowed("aé日".as_bytes()),
				27	//! to_cesu8("aé日"));
				28	//! assert_eq!(Cow::Borrowed("aé日"),
				29	//! from_cesu8("aé日".as_bytes()).unwrap());
				30	//!
				31	//! // This string is CESU-8 data containing a 6-byte surrogate pair,
				32	//! // which decodes to a 4-byte UTF-8 string.
				33	//! let data = &[0xED, 0xA0, 0x81, 0xED, 0xB0, 0x81];
				34	//! assert_eq!(Cow::Borrowed("\u{10401}"),
				35	//! from_cesu8(data).unwrap());
				36	//! ```
				37	//!
				38	//! ### A note about security
				39	//!
				40	//! As a general rule, this library is intended to fail on malformed or
				41	//! unexpected input. CESU-8 is supposed to be an internal-only format,
				42	//! and if we're seeing malformed data, we assume that it's either a bug in
				43	//! somebody's code, or an attacker is trying to improperly encode data to
				44	//! evade security checks.
				45	//!
				46	//! If you have a use case for lossy conversion to UTF-8, or conversion
				47	//! from mixed UTF-8/CESU-8 data, please feel free to submit a pull request
				48	//! for `from_cesu8_lossy_permissive` with appropriate behavior.
				49	//!
				50	//! ### Java and U+0000, and other variants
				51	//!
				52	//! Java uses the CESU-8 encoding as described above, but with one
				53	//! difference: The null character U+0000 is represented as an overlong
				54	//! UTF-8 sequence `C0 80`. This is supported by the `from_java_cesu8` and
				55	//! `to_java_cesu8` methods.
				56	//!
				57	//! ### Surrogate pairs and UTF-8
				58	//!
				59	//! The UTF-16 encoding uses "surrogate pairs" to represent Unicode code
				60	//! points in the range from U+10000 to U+10FFFF. These are 16-bit numbers
				61	//! in the range 0xD800 to 0xDFFF.
				62	//!
				63	//! * 0xD800 to 0xDBFF: First half of surrogate pair. When encoded as
				64	//! CESU-8, these become 11101101 10100000 10000000 to
				65	//! 11101101 10101111 10111111.
				66	//!
				67	//! * 0xDC00 to 0xDFFF: Second half of surrogate pair. These become
				68	//! 11101101 10110000 10000000 to
				69	//! 11101101 10111111 10111111.
				70	//!
				71	//! Wikipedia [explains](http://en.wikipedia.org/wiki/UTF-16) the
				72	//! code point to UTF-16 conversion process:
				73	//!
				74	//! > Consider the encoding of U+10437 (𐐷):
				75	//! >
				76	//! > * Subtract 0x10000 from 0x10437. The result is 0x00437, 0000 0000 0100
				77	//! > 0011 0111.
				78	//! > * Split this into the high 10-bit value and the low 10-bit value:
				79	//! > 0000000001 and 0000110111.
				80	//! > * Add 0xD800 to the high value to form the high surrogate: 0xD800 +
				81	//! > 0x0001 = 0xD801.
				82	//! > * Add 0xDC00 to the low value to form the low surrogate: 0xDC00 +
				83	//! > 0x0037 = 0xDC37.
				84
				85	#![warn(missing_docs)]
				86
				87
				88	use std::borrow::Cow;
				89	use std::error::Error;
				90	use std::fmt;
				91	use std::result::Result;
				92	use std::slice;
				93	use std::str::{from_utf8, from_utf8_unchecked};
				94	use unicode::utf8_char_width;
				95
				96	mod unicode;
				97
				/// Mask of the value bits of a continuation byte.
				///
				/// AND-ing a byte with this mask keeps its low six bits; AND-ing with
				/// `!CONT_MASK` keeps the top two (tag) bits instead.
				const CONT_MASK: u8 = 0b0011_1111u8;
				/// Value of the tag bits (tag mask is !CONT_MASK) of a continuation byte.
				///
				/// A byte `b` is treated as a continuation byte when
				/// `b & !CONT_MASK == TAG_CONT_U8`, i.e. when it has the form `10xxxxxx`.
				const TAG_CONT_U8: u8 = 0b1000_0000u8;
				102
				/// The CESU-8 data could not be decoded as valid UTF-8 data.
				///
				/// This is a unit type: it records only that decoding failed, not where or
				/// why.
				#[derive(Clone, Copy, Debug)]
				pub struct Cesu8DecodingError;

				impl Error for Cesu8DecodingError {
				    /// Short, static description of the failure.
				    fn description(&self) -> &str { "decoding error" }

				    /// This error never wraps another error, so there is no underlying cause.
				    ///
				    /// The return type is spelled `&dyn Error`: the bare `&Error` trait-object
				    /// syntax is deprecated and is a hard error from the 2021 edition on.
				    fn cause(&self) -> Option<&dyn Error> { None }
				}

				impl fmt::Display for Cesu8DecodingError {
				    /// Writes the fixed, human-readable failure message.
				    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
				        write!(f, "could not convert CESU-8 data to UTF-8")
				    }
				}
				117
				/// Which variant of the encoding are we working with?
				///
				/// The type is a plain tag, so it derives `Clone`/`Copy` (it is passed by
				/// value and compared repeatedly) and `Debug` (so it can appear in
				/// assertion and diagnostic output).
				#[derive(Clone, Copy, Debug, PartialEq, Eq)]
				enum Variant {
				    /// Regular CESU-8, with '\0' represented by itself.
				    Standard,
				    /// This is technically Java's "Modified UTF-8", which is supposedly
				    /// like CESU-8, except that it UTF-8 encodes the '\0' byte.  I'm sure
				    /// it seemed like a good idea at the time.
				    Java,
				}
				128
				/// Convert CESU-8 data to a Rust string, re-encoding only if necessary.
				/// Returns an error if the data cannot be represented as valid UTF-8.
				///
				/// Input that is already valid UTF-8 is returned as a borrowed string
				/// without copying; otherwise the bytes are decoded into a newly allocated
				/// `String`.
				///
				/// # Errors
				///
				/// Returns `Cesu8DecodingError` when the input is not valid UTF-8 and the
				/// internal decoder also rejects it.
				///
				/// ```
				/// use std::borrow::Cow;
				/// use cesu8::from_cesu8;
				///
				/// // This string is valid as UTF-8 or CESU-8, so it doesn't change,
				/// // and we can convert it without allocating memory.
				/// assert_eq!(Cow::Borrowed("aé日"),
				///            from_cesu8("aé日".as_bytes()).unwrap());
				///
				/// // This string is CESU-8 data containing a 6-byte surrogate pair,
				/// // which becomes a 4-byte UTF-8 string.
				/// let data = &[0xED, 0xA0, 0x81, 0xED, 0xB0, 0x81];
				/// assert_eq!(Cow::Borrowed("\u{10401}"),
				///            from_cesu8(data).unwrap());
				/// ```
				pub fn from_cesu8(bytes: &[u8]) -> Result<Cow<str>, Cesu8DecodingError> {
				    from_cesu8_internal(bytes, Variant::Standard)
				}
				150
				/// Convert Java's modified UTF-8 data to a Rust string, re-encoding only if
				/// necessary.  Returns an error if the data cannot be represented as valid
				/// UTF-8.
				///
				/// Input that is already valid UTF-8 is returned as a borrowed string
				/// without copying; otherwise the bytes are decoded into a newly allocated
				/// `String`, with the two-byte sequence `C0 80` decoded as U+0000.
				///
				/// # Errors
				///
				/// Returns `Cesu8DecodingError` when the input is not valid UTF-8 and the
				/// internal decoder also rejects it.
				///
				/// ```
				/// use std::borrow::Cow;
				/// use cesu8::from_java_cesu8;
				///
				/// // This string is valid as UTF-8 or modified UTF-8, so it doesn't change,
				/// // and we can convert it without allocating memory.
				/// assert_eq!(Cow::Borrowed("aé日"),
				///            from_java_cesu8("aé日".as_bytes()).unwrap());
				///
				/// // This string is modified UTF-8 data containing a 6-byte surrogate pair,
				/// // which becomes a 4-byte UTF-8 string.
				/// let data = &[0xED, 0xA0, 0x81, 0xED, 0xB0, 0x81];
				/// assert_eq!(Cow::Borrowed("\u{10401}"),
				///            from_java_cesu8(data).unwrap());
				///
				/// // This string is modified UTF-8 data containing null code-points.
				/// let data = &[0xC0, 0x80, 0xC0, 0x80];
				/// assert_eq!(Cow::Borrowed("\0\0"),
				///            from_java_cesu8(data).unwrap());
				/// ```
				pub fn from_java_cesu8(bytes: &[u8]) -> Result<Cow<str>, Cesu8DecodingError> {
				    from_cesu8_internal(bytes, Variant::Java)
				}
				178
				179	/// Do the actual work of decoding.
				180	fn from_cesu8_internal(bytes: &[u8], variant: Variant) ->
				181	Result<Cow<str>, Cesu8DecodingError>
				182	{
				183	match from_utf8(bytes) {
				184	Ok(str) => Ok(Cow::Borrowed(str)),
				185	_ => {
				186	let mut decoded = Vec::with_capacity(bytes.len());
				187	if decode_from_iter(&mut decoded, &mut bytes.iter(), variant) {
				188	// Keep this assertion in debug mode only. It's important
				189	// that this assertion is true, because Rust assumes that
				190	// all UTF-8 strings are valid.
				191	debug_assert!(from_utf8(&decoded[..]).is_ok());
				192	Ok(Cow::Owned(unsafe { String::from_utf8_unchecked(decoded) }))
				193	} else {
				194	Err(Cesu8DecodingError)
				195	}
				196	}
				197	}
				198	}
				199
				#[test]
				fn test_from_cesu8() {
				    // The surrogate-encoded character below is from the ICU library's
				    // icu/source/test/testdata/conversion.txt test case.
				    let encoded = &[0x4D, 0xE6, 0x97, 0xA5, 0xED, 0xA0, 0x81, 0xED, 0xB0, 0x81, 0x7F];
				    let decoded = from_cesu8(encoded).unwrap();
				    assert_eq!(Cow::Borrowed("M日\u{10401}\u{7F}"), decoded);

				    // We used to have test data from the CESU-8 specification, but when we
				    // worked it through manually, we got the wrong answer:
				    //
				    // Input: [0xED, 0xAE, 0x80, 0xED, 0xB0, 0x80]
				    // Binary: 11101101 10101110 10000000 11101101 10110000 10000000
				    //
				    // 0b1101_101110_000000 -> 0xDB80
				    // 0b1101_110000_000000 -> 0xDC00
				    //
				    // ((0xDB80 - 0xD800) << 10) | (0xDC00 - 0xDC00) -> 0xE0000
				    // 0x10000 + 0xE0000 -> 0xF0000
				    //
				    // The spec claims that we are supposed to get 0x10000, not 0xF0000.
				    // Since I can't reconcile this example data with the text of the
				    // specification, I decided to use a test character from ICU instead.
				}
				224
				225	// Our internal decoder, based on Rust's is_utf8 implementation.
				226	fn decode_from_iter(
				227	decoded: &mut Vec<u8>, iter: &mut slice::Iter<u8>, variant: Variant)
				228	-> bool
				229	{
				230	macro_rules! err {
				231	() => { return false }
				232	}
				233	macro_rules! next {
				234	() => {
				235	match iter.next() {
				236	Some(a) => *a,
				237	// We needed data, but there was none: error!
				238	None => err!()
				239	}
				240	}
				241	}
				242	macro_rules! next_cont {
				243	() => {
				244	{
				245	let byte = next!();
				246	if (byte) & !CONT_MASK == TAG_CONT_U8 { byte } else { err!() }
				247	}
				248	}
				249	}
				250
				251	loop {
				252	let first = match iter.next() {
				253	Some(&b) => b,
				254	// We're at the end of the iterator and a codepoint boundary at
				255	// the same time, so this string is valid.
				256	None => return true
				257	};
				258
				259	if variant == Variant::Java && first == 0 {
				260	// Java's modified UTF-8 should never contain \0 directly.
				261	err!();
				262	} else if first < 128 {
				263	// Pass ASCII through directly.
				264	decoded.push(first);
				265	} else if first == 0xc0 && variant == Variant::Java {
				266	match next!() {
				267	0x80 => decoded.push(0),
				268	_ => err!(),
				269	}
				270	} else {
				271	let w = utf8_char_width(first);
				272	let second = next_cont!();
				273	match w {
				274	// Two-byte sequences can be used directly.
				275	2 => { decoded.extend([first, second].iter().cloned()); }
				276	3 => {
				277	let third = next_cont!();
				278	match (first, second) {
				279	// These are valid UTF-8, so pass them through.
				280	(0xE0 , 0xA0 ... 0xBF) \|
				281	(0xE1 ... 0xEC, 0x80 ... 0xBF) \|
				282	(0xED , 0x80 ... 0x9F) \|
				283	(0xEE ... 0xEF, 0x80 ... 0xBF) => {
				284	decoded.extend([first, second, third].iter()
				285	.cloned())
				286	}
				287	// First half a surrogate pair, so decode.
				288	(0xED , 0xA0 ... 0xAF) => {
				289	if next!() != 0xED { err!() }
				290	let fifth = next_cont!();
				291	if fifth < 0xB0 \|\| 0xBF < fifth { err!() }
				292	let sixth = next_cont!();
				293	let s = dec_surrogates(second, third, fifth, sixth);
				294	decoded.extend(s.iter().cloned());
				295	}
				296	_ => err!()
				297	}
				298	}
				299	_ => err!()
				300	}
				301	}
				302	}
				303	}
				304
				305	/// Convert the two trailing bytes from a CESU-8 surrogate to a regular
				306	/// surrogate value.
				307	fn dec_surrogate(second: u8, third: u8) -> u32 {
				308	0xD000u32 \| ((second & CONT_MASK) as u32) << 6 \| (third & CONT_MASK) as u32
				309	}
				310
				311	/// Convert the bytes from a CESU-8 surrogate pair into a valid UTF-8
				312	/// sequence. Assumes input is valid.
				313	fn dec_surrogates(second: u8, third: u8, fifth: u8, sixth: u8) -> [u8; 4] {
				314	// Convert to a 32-bit code point.
				315	let s1 = dec_surrogate(second, third);
				316	let s2 = dec_surrogate(fifth, sixth);
				317	let c = 0x10000 + (((s1 - 0xD800) << 10) \| (s2 - 0xDC00));
				318	//println!("{:0>8b} {:0>8b} {:0>8b} -> {:0>16b}", 0xEDu8, second, third, s1);
				319	//println!("{:0>8b} {:0>8b} {:0>8b} -> {:0>16b}", 0xEDu8, fifth, sixth, s2);
				320	//println!("-> {:0>32b}", c);
				321	assert!(0x010000 <= c && c <= 0x10FFFF);
				322
				323	// Convert to UTF-8.
				324	// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
				325	[0b1111_0000u8 \| ((c & 0b1_1100_0000_0000_0000_0000) >> 18) as u8,
				326	TAG_CONT_U8 \| ((c & 0b0_0011_1111_0000_0000_0000) >> 12) as u8,
				327	TAG_CONT_U8 \| ((c & 0b0_0000_0000_1111_1100_0000) >> 6) as u8,
				328	TAG_CONT_U8 \| ((c & 0b0_0000_0000_0000_0011_1111) ) as u8]
				329	}
				330
				331	/// Convert a Rust `&str` to CESU-8 bytes.
				332	///
				333	/// ```
				334	/// use std::borrow::Cow;
				335	/// use cesu8::to_cesu8;
				336	///
				337	/// // This string is valid as UTF-8 or CESU-8, so it doesn't change,
				338	/// // and we can convert it without allocating memory.
				339	/// assert_eq!(Cow::Borrowed("aé日".as_bytes()), to_cesu8("aé日"));
				340	///
				341	/// // This string is a 4-byte UTF-8 string, which becomes a 6-byte CESU-8
				342	/// // vector.
				343	/// assert_eq!(Cow::Borrowed(&[0xED, 0xA0, 0x81, 0xED, 0xB0, 0x81]),
				344	/// to_cesu8("\u{10401}"));
				345	/// ```
				346	pub fn to_cesu8(text: &str) -> Cow<[u8]> {
				347	if is_valid_cesu8(text) {
				348	Cow::Borrowed(text.as_bytes())
				349	} else {
				350	Cow::Owned(to_cesu8_internal(text, Variant::Standard))
				351	}
				352	}
				353
				354	/// Convert a Rust `&str` to Java's modified UTF-8 bytes.
				355	///
				356	/// ```
				357	/// use std::borrow::Cow;
				358	/// use cesu8::to_java_cesu8;
				359	///
				360	/// // This string is valid as UTF-8 or CESU-8, so it doesn't change,
				361	/// // and we can convert it without allocating memory.
				362	/// assert_eq!(Cow::Borrowed("aé日".as_bytes()), to_java_cesu8("aé日"));
				363	///
				364	/// // This string is a 4-byte UTF-8 string, which becomes a 6-byte modified
				365	/// // UTF-8 vector.
				366	/// assert_eq!(Cow::Borrowed(&[0xED, 0xA0, 0x81, 0xED, 0xB0, 0x81]),
				367	/// to_java_cesu8("\u{10401}"));
				368	///
				369	/// // This string contains null, which becomes 2-byte modified UTF-8 encoding
				370	/// assert_eq!(Cow::Borrowed(&[0xC0, 0x80, 0xC0, 0x80]),
				371	/// to_java_cesu8("\0\0"));
				372	/// ```
				373	pub fn to_java_cesu8(text: &str) -> Cow<[u8]> {
				374	if is_valid_java_cesu8(text) {
				375	Cow::Borrowed(text.as_bytes())
				376	} else {
				377	Cow::Owned(to_cesu8_internal(text, Variant::Java))
				378	}
				379	}
				380
				381	fn to_cesu8_internal(text: &str, variant: Variant) -> Vec<u8> {
				382	let bytes = text.as_bytes();
				383	let mut encoded = Vec::with_capacity(bytes.len() + bytes.len() >> 2);
				384	let mut i = 0;
				385	while i < bytes.len() {
				386	let b = bytes[i];
				387	if variant == Variant::Java && b == 0 {
				388	encoded.push(0xc0);
				389	encoded.push(0x80);
				390	i += 1;
				391	} else if b < 128 {
				392	// Pass ASCII through quickly.
				393	encoded.push(b);
				394	i += 1;
				395	} else {
				396	// Figure out how many bytes we need for this character.
				397	let w = utf8_char_width(b);
				398	assert!(w <= 4);
				399	assert!(i + w <= bytes.len());
				400	if w != 4 {
				401	// Pass through short UTF-8 sequences unmodified.
				402	encoded.extend(bytes[i..i+w].iter().cloned());
				403	} else {
				404	// Encode 4-byte sequences as 6 bytes.
				405	let s = unsafe { from_utf8_unchecked(&bytes[i..i+w]) };
				406	let c = s.chars().next().unwrap() as u32 - 0x10000;
				407	let mut s: [u16; 2] = [0; 2];
				408	s[0] = ((c >> 10) as u16) \| 0xD800;
				409	s[1] = ((c & 0x3FF) as u16) \| 0xDC00;
				410	encoded.extend(enc_surrogate(s[0]).iter().cloned());
				411	encoded.extend(enc_surrogate(s[1]).iter().cloned());
				412	}
				413	i += w;
				414	}
				415	}
				416	encoded
				417	}
				418
				419	/// Check whether a Rust string contains valid CESU-8 data.
				420	pub fn is_valid_cesu8(text: &str) -> bool {
				421	// We rely on the fact that Rust strings are guaranteed to be valid
				422	// UTF-8.
				423	for b in text.bytes() {
				424	if (b & !CONT_MASK) == TAG_CONT_U8 { continue; }
				425	if utf8_char_width(b) > 3 { return false; }
				426	}
				427	true
				428	}
				429
				430	/// Check whether a Rust string contains valid Java's modified UTF-8 data.
				431	pub fn is_valid_java_cesu8(text: &str) -> bool {
				432	!text.contains('\0') && is_valid_cesu8(text)
				433	}
				434
				#[test]
				fn test_valid_cesu8() {
				    // (input, expected for standard CESU-8, expected for Java's variant)
				    let cases = [
				        ("aé日", true, true),
				        ("\u{10401}", false, false),
				        ("\0\0", true, false),
				    ];
				    for &(text, standard_ok, java_ok) in cases.iter() {
				        assert_eq!(is_valid_cesu8(text), standard_ok);
				        assert_eq!(is_valid_java_cesu8(text), java_ok);
				    }
				}
				444
				445
				446	/// Encode a single surrogate as CESU-8.
				447	fn enc_surrogate(surrogate: u16) -> [u8; 3] {
				448	assert!(0xD800 <= surrogate && surrogate <= 0xDFFF);
				449	// 1110xxxx 10xxxxxx 10xxxxxx
				450	[0b11100000 \| ((surrogate & 0b11110000_00000000) >> 12) as u8,
				451	TAG_CONT_U8 \| ((surrogate & 0b00001111_11000000) >> 6) as u8,
				452	TAG_CONT_U8 \| ((surrogate & 0b00000000_00111111) ) as u8]
				453	}