// Copyright 2012-2014 The Rust Project Developers and Eric Kidd.  See the
// COPYRIGHT-RUST.txt file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed except
// according to those terms.
9
10
//! A simple library implementing the [CESU-8 compatibility encoding
//! scheme](http://www.unicode.org/reports/tr26/tr26-2.html).  This is a
//! non-standard variant of UTF-8 that is used internally by some systems
//! that need to represent UTF-16 data as 8-bit characters.  Yes, this is
//! ugly.
//!
//! Use of this encoding is discouraged by the Unicode Consortium.  It's OK
//! for working with existing internal APIs, but it should not be used for
//! transmitting or storing data.
20//!
//! ```
//! use std::borrow::Cow;
//! use cesu8::{from_cesu8, to_cesu8};
//!
//! // 16-bit Unicode characters are the same in UTF-8 and CESU-8.
//! assert_eq!(Cow::Borrowed("aé日".as_bytes()),
//!            to_cesu8("aé日"));
//! assert_eq!(Cow::Borrowed("aé日"),
//!            from_cesu8("aé日".as_bytes()).unwrap());
//!
//! // This string is CESU-8 data containing a 6-byte surrogate pair,
//! // which decodes to a 4-byte UTF-8 string.
//! let data = &[0xED, 0xA0, 0x81, 0xED, 0xB0, 0x81];
//! assert_eq!(Cow::Borrowed("\u{10401}"),
//!            from_cesu8(data).unwrap());
//! ```
37//!
//! ### A note about security
//!
//! As a general rule, this library is intended to fail on malformed or
//! unexpected input.  CESU-8 is supposed to be an internal-only format,
//! and if we're seeing malformed data, we assume that it's either a bug in
//! somebody's code, or an attacker is trying to improperly encode data to
//! evade security checks.
//!
//! If you have a use case for lossy conversion to UTF-8, or conversion
//! from mixed UTF-8/CESU-8 data, please feel free to submit a pull request
//! for `from_cesu8_lossy_permissive` with appropriate behavior.
49//!
//! ### Java and U+0000, and other variants
//!
//! Java uses the CESU-8 encoding as described above, but with one
//! difference: The null character U+0000 is represented as an overlong
//! UTF-8 sequence `C0 80`.  This is supported by the `from_java_cesu8` and
//! `to_java_cesu8` methods.
56//!
//! ### Surrogate pairs and UTF-8
//!
//! The UTF-16 encoding uses "surrogate pairs" to represent Unicode code
//! points in the range from U+10000 to U+10FFFF.  Each half of a pair is
//! a 16-bit number in the range 0xD800 to 0xDFFF.
//!
//! * 0xD800 to 0xDBFF: First half of surrogate pair.  When encoded as
//!   CESU-8, these become **1110**1101 **10**100000 **10**000000 to
//!   **1110**1101 **10**101111 **10**111111.
//!
//! * 0xDC00 to 0xDFFF: Second half of surrogate pair.  These become
//!   **1110**1101 **10**110000 **10**000000 to
//!   **1110**1101 **10**111111 **10**111111.
//!
//! Wikipedia [explains](http://en.wikipedia.org/wiki/UTF-16) the
//! code point to UTF-16 conversion process:
//!
//! > Consider the encoding of U+10437 (𐐷):
//! >
//! > * Subtract 0x10000 from 0x10437. The result is 0x00437, 0000 0000 0100
//! >   0011 0111.
//! > * Split this into the high 10-bit value and the low 10-bit value:
//! >   0000000001 and 0000110111.
//! > * Add 0xD800 to the high value to form the high surrogate: 0xD800 +
//! >   0x0001 = 0xD801.
//! > * Add 0xDC00 to the low value to form the low surrogate: 0xDC00 +
//! >   0x0037 = 0xDC37.
84
85#![warn(missing_docs)]
86
87
88use std::borrow::Cow;
89use std::error::Error;
90use std::fmt;
91use std::result::Result;
92use std::slice;
93use std::str::{from_utf8, from_utf8_unchecked};
94use unicode::utf8_char_width;
95
96mod unicode;
97
/// Selects the six payload bits (`xx_xxxx`) of a UTF-8 continuation byte.
const CONT_MASK: u8 = 0b0011_1111;
/// Tag pattern (`10xx_xxxx`) that marks a continuation byte; the tag bits
/// themselves are selected with `!CONT_MASK`.
const TAG_CONT_U8: u8 = 0b1000_0000;
102
/// The CESU-8 data could not be decoded as valid UTF-8 data.
///
/// The error carries no payload: it only signals that decoding failed.
#[derive(Clone, Copy, Debug)]
pub struct Cesu8DecodingError;

impl Error for Cesu8DecodingError {
    // `description` is superseded by `Display`, but the override is kept so
    // that callers which still read it keep getting the same text.
    fn description(&self) -> &str {
        "decoding error"
    }

    // No `cause`/`source` override is needed: this error never wraps
    // another one, and the trait's default implementations already
    // report `None`.
}

impl fmt::Display for Cesu8DecodingError {
    /// Writes a fixed, human-readable message describing the failure.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        f.write_str("could not convert CESU-8 data to UTF-8")
    }
}
117
/// Which variant of the encoding are we working with?
///
/// The enum is a plain tag with no payload, so it is `Copy` and can be
/// handed to several helpers without being consumed.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum Variant {
    /// Regular CESU-8, with '\0' represented by itself.
    Standard,
    /// This is technically Java's "Modified UTF-8", which is supposedly
    /// like CESU-8, except that '\0' is written as the overlong two-byte
    /// sequence `C0 80` instead of a single zero byte.  I'm sure it seemed
    /// like a good idea at the time.
    Java,
}
128
129/// Convert CESU-8 data to a Rust string, re-encoding only if necessary.
130/// Returns an error if the data cannot be represented as valid UTF-8.
131///
132/// ```
133/// use std::borrow::Cow;
134/// use cesu8::from_cesu8;
135///
136/// // This string is valid as UTF-8 or CESU-8, so it doesn't change,
137/// // and we can convert it without allocating memory.
138/// assert_eq!(Cow::Borrowed("aé日"),
139/// from_cesu8("aé日".as_bytes()).unwrap());
140///
141/// // This string is CESU-8 data containing a 6-byte surrogate pair,
142/// // which becomes a 4-byte UTF-8 string.
143/// let data = &[0xED, 0xA0, 0x81, 0xED, 0xB0, 0x81];
144/// assert_eq!(Cow::Borrowed("\u{10401}"),
145/// from_cesu8(data).unwrap());
146/// ```
147pub fn from_cesu8(bytes: &[u8]) -> Result<Cow<str>, Cesu8DecodingError> {
148 from_cesu8_internal(bytes, Variant::Standard)
149}
150
151/// Convert Java's modified UTF-8 data to a Rust string, re-encoding only if
152/// necessary. Returns an error if the data cannot be represented as valid
153/// UTF-8.
154///
155/// ```
156/// use std::borrow::Cow;
157/// use cesu8::from_java_cesu8;
158///
159/// // This string is valid as UTF-8 or modified UTF-8, so it doesn't change,
160/// // and we can convert it without allocating memory.
161/// assert_eq!(Cow::Borrowed("aé日"),
162/// from_java_cesu8("aé日".as_bytes()).unwrap());
163///
164/// // This string is modified UTF-8 data containing a 6-byte surrogate pair,
165/// // which becomes a 4-byte UTF-8 string.
166/// let data = &[0xED, 0xA0, 0x81, 0xED, 0xB0, 0x81];
167/// assert_eq!(Cow::Borrowed("\u{10401}"),
168/// from_java_cesu8(data).unwrap());
169///
170/// // This string is modified UTF-8 data containing null code-points.
171/// let data = &[0xC0, 0x80, 0xC0, 0x80];
172/// assert_eq!(Cow::Borrowed("\0\0"),
173/// from_java_cesu8(data).unwrap());
174/// ```
175pub fn from_java_cesu8(bytes: &[u8]) -> Result<Cow<str>, Cesu8DecodingError> {
176 from_cesu8_internal(bytes, Variant::Java)
177}
178
179/// Do the actual work of decoding.
180fn from_cesu8_internal(bytes: &[u8], variant: Variant) ->
181 Result<Cow<str>, Cesu8DecodingError>
182{
183 match from_utf8(bytes) {
184 Ok(str) => Ok(Cow::Borrowed(str)),
185 _ => {
186 let mut decoded = Vec::with_capacity(bytes.len());
187 if decode_from_iter(&mut decoded, &mut bytes.iter(), variant) {
188 // Keep this assertion in debug mode only. It's important
189 // that this assertion is true, because Rust assumes that
190 // all UTF-8 strings are valid.
191 debug_assert!(from_utf8(&decoded[..]).is_ok());
192 Ok(Cow::Owned(unsafe { String::from_utf8_unchecked(decoded) }))
193 } else {
194 Err(Cesu8DecodingError)
195 }
196 }
197 }
198}
199
#[test]
fn test_from_cesu8() {
    // The surrogate-encoded character in the middle of this input comes
    // from the ICU library's icu/source/test/testdata/conversion.txt test
    // case.
    let input: &[u8] =
        &[0x4D, 0xE6, 0x97, 0xA5, 0xED, 0xA0, 0x81, 0xED, 0xB0, 0x81, 0x7F];
    let expected: Cow<str> = Cow::Borrowed("M日\u{10401}\u{7F}");
    let actual = from_cesu8(input).unwrap();
    assert_eq!(expected, actual);

    // Test data from the CESU-8 specification used to live here, but
    // working it through by hand gave a different answer from the spec's:
    //
    //   Input: [0xED, 0xAE, 0x80, 0xED, 0xB0, 0x80]
    //   Binary: 11101101 10101110 10000000 11101101 10110000 10000000
    //
    //   0b1101_101110_000000 -> 0xDB80
    //   0b1101_110000_000000 -> 0xDC00
    //
    //   ((0xDB80 - 0xD800) << 10) | (0xDC00 - 0xDC00) -> 0xE0000
    //   0x10000 + 0xE0000 -> 0xF0000
    //
    // The spec claims the result should be 0x10000, not 0xF0000.  Since
    // that example could not be reconciled with the text of the
    // specification, a test character from ICU is used instead.
}
224
225// Our internal decoder, based on Rust's is_utf8 implementation.
226fn decode_from_iter(
227 decoded: &mut Vec<u8>, iter: &mut slice::Iter<u8>, variant: Variant)
228 -> bool
229{
230 macro_rules! err {
231 () => { return false }
232 }
233 macro_rules! next {
234 () => {
235 match iter.next() {
236 Some(a) => *a,
237 // We needed data, but there was none: error!
238 None => err!()
239 }
240 }
241 }
242 macro_rules! next_cont {
243 () => {
244 {
245 let byte = next!();
246 if (byte) & !CONT_MASK == TAG_CONT_U8 { byte } else { err!() }
247 }
248 }
249 }
250
251 loop {
252 let first = match iter.next() {
253 Some(&b) => b,
254 // We're at the end of the iterator and a codepoint boundary at
255 // the same time, so this string is valid.
256 None => return true
257 };
258
259 if variant == Variant::Java && first == 0 {
260 // Java's modified UTF-8 should never contain \0 directly.
261 err!();
262 } else if first < 128 {
263 // Pass ASCII through directly.
264 decoded.push(first);
265 } else if first == 0xc0 && variant == Variant::Java {
266 match next!() {
267 0x80 => decoded.push(0),
268 _ => err!(),
269 }
270 } else {
271 let w = utf8_char_width(first);
272 let second = next_cont!();
273 match w {
274 // Two-byte sequences can be used directly.
275 2 => { decoded.extend([first, second].iter().cloned()); }
276 3 => {
277 let third = next_cont!();
278 match (first, second) {
279 // These are valid UTF-8, so pass them through.
280 (0xE0 , 0xA0 ... 0xBF) |
281 (0xE1 ... 0xEC, 0x80 ... 0xBF) |
282 (0xED , 0x80 ... 0x9F) |
283 (0xEE ... 0xEF, 0x80 ... 0xBF) => {
284 decoded.extend([first, second, third].iter()
285 .cloned())
286 }
287 // First half a surrogate pair, so decode.
288 (0xED , 0xA0 ... 0xAF) => {
289 if next!() != 0xED { err!() }
290 let fifth = next_cont!();
291 if fifth < 0xB0 || 0xBF < fifth { err!() }
292 let sixth = next_cont!();
293 let s = dec_surrogates(second, third, fifth, sixth);
294 decoded.extend(s.iter().cloned());
295 }
296 _ => err!()
297 }
298 }
299 _ => err!()
300 }
301 }
302 }
303}
304
305/// Convert the two trailing bytes from a CESU-8 surrogate to a regular
306/// surrogate value.
307fn dec_surrogate(second: u8, third: u8) -> u32 {
308 0xD000u32 | ((second & CONT_MASK) as u32) << 6 | (third & CONT_MASK) as u32
309}
310
311/// Convert the bytes from a CESU-8 surrogate pair into a valid UTF-8
312/// sequence. Assumes input is valid.
313fn dec_surrogates(second: u8, third: u8, fifth: u8, sixth: u8) -> [u8; 4] {
314 // Convert to a 32-bit code point.
315 let s1 = dec_surrogate(second, third);
316 let s2 = dec_surrogate(fifth, sixth);
317 let c = 0x10000 + (((s1 - 0xD800) << 10) | (s2 - 0xDC00));
318 //println!("{:0>8b} {:0>8b} {:0>8b} -> {:0>16b}", 0xEDu8, second, third, s1);
319 //println!("{:0>8b} {:0>8b} {:0>8b} -> {:0>16b}", 0xEDu8, fifth, sixth, s2);
320 //println!("-> {:0>32b}", c);
321 assert!(0x010000 <= c && c <= 0x10FFFF);
322
323 // Convert to UTF-8.
324 // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
325 [0b1111_0000u8 | ((c & 0b1_1100_0000_0000_0000_0000) >> 18) as u8,
326 TAG_CONT_U8 | ((c & 0b0_0011_1111_0000_0000_0000) >> 12) as u8,
327 TAG_CONT_U8 | ((c & 0b0_0000_0000_1111_1100_0000) >> 6) as u8,
328 TAG_CONT_U8 | ((c & 0b0_0000_0000_0000_0011_1111) ) as u8]
329}
330
331/// Convert a Rust `&str` to CESU-8 bytes.
332///
333/// ```
334/// use std::borrow::Cow;
335/// use cesu8::to_cesu8;
336///
337/// // This string is valid as UTF-8 or CESU-8, so it doesn't change,
338/// // and we can convert it without allocating memory.
339/// assert_eq!(Cow::Borrowed("aé日".as_bytes()), to_cesu8("aé日"));
340///
341/// // This string is a 4-byte UTF-8 string, which becomes a 6-byte CESU-8
342/// // vector.
343/// assert_eq!(Cow::Borrowed(&[0xED, 0xA0, 0x81, 0xED, 0xB0, 0x81]),
344/// to_cesu8("\u{10401}"));
345/// ```
346pub fn to_cesu8(text: &str) -> Cow<[u8]> {
347 if is_valid_cesu8(text) {
348 Cow::Borrowed(text.as_bytes())
349 } else {
350 Cow::Owned(to_cesu8_internal(text, Variant::Standard))
351 }
352}
353
354/// Convert a Rust `&str` to Java's modified UTF-8 bytes.
355///
356/// ```
357/// use std::borrow::Cow;
358/// use cesu8::to_java_cesu8;
359///
360/// // This string is valid as UTF-8 or CESU-8, so it doesn't change,
361/// // and we can convert it without allocating memory.
362/// assert_eq!(Cow::Borrowed("aé日".as_bytes()), to_java_cesu8("aé日"));
363///
364/// // This string is a 4-byte UTF-8 string, which becomes a 6-byte modified
365/// // UTF-8 vector.
366/// assert_eq!(Cow::Borrowed(&[0xED, 0xA0, 0x81, 0xED, 0xB0, 0x81]),
367/// to_java_cesu8("\u{10401}"));
368///
369/// // This string contains null, which becomes 2-byte modified UTF-8 encoding
370/// assert_eq!(Cow::Borrowed(&[0xC0, 0x80, 0xC0, 0x80]),
371/// to_java_cesu8("\0\0"));
372/// ```
373pub fn to_java_cesu8(text: &str) -> Cow<[u8]> {
374 if is_valid_java_cesu8(text) {
375 Cow::Borrowed(text.as_bytes())
376 } else {
377 Cow::Owned(to_cesu8_internal(text, Variant::Java))
378 }
379}
380
381fn to_cesu8_internal(text: &str, variant: Variant) -> Vec<u8> {
382 let bytes = text.as_bytes();
383 let mut encoded = Vec::with_capacity(bytes.len() + bytes.len() >> 2);
384 let mut i = 0;
385 while i < bytes.len() {
386 let b = bytes[i];
387 if variant == Variant::Java && b == 0 {
388 encoded.push(0xc0);
389 encoded.push(0x80);
390 i += 1;
391 } else if b < 128 {
392 // Pass ASCII through quickly.
393 encoded.push(b);
394 i += 1;
395 } else {
396 // Figure out how many bytes we need for this character.
397 let w = utf8_char_width(b);
398 assert!(w <= 4);
399 assert!(i + w <= bytes.len());
400 if w != 4 {
401 // Pass through short UTF-8 sequences unmodified.
402 encoded.extend(bytes[i..i+w].iter().cloned());
403 } else {
404 // Encode 4-byte sequences as 6 bytes.
405 let s = unsafe { from_utf8_unchecked(&bytes[i..i+w]) };
406 let c = s.chars().next().unwrap() as u32 - 0x10000;
407 let mut s: [u16; 2] = [0; 2];
408 s[0] = ((c >> 10) as u16) | 0xD800;
409 s[1] = ((c & 0x3FF) as u16) | 0xDC00;
410 encoded.extend(enc_surrogate(s[0]).iter().cloned());
411 encoded.extend(enc_surrogate(s[1]).iter().cloned());
412 }
413 i += w;
414 }
415 }
416 encoded
417}
418
419/// Check whether a Rust string contains valid CESU-8 data.
420pub fn is_valid_cesu8(text: &str) -> bool {
421 // We rely on the fact that Rust strings are guaranteed to be valid
422 // UTF-8.
423 for b in text.bytes() {
424 if (b & !CONT_MASK) == TAG_CONT_U8 { continue; }
425 if utf8_char_width(b) > 3 { return false; }
426 }
427 true
428}
429
430/// Check whether a Rust string contains valid Java's modified UTF-8 data.
431pub fn is_valid_java_cesu8(text: &str) -> bool {
432 !text.contains('\0') && is_valid_cesu8(text)
433}
434
#[test]
fn test_valid_cesu8() {
    // (input, valid as CESU-8, valid as Java's modified UTF-8)
    let cases = [
        ("aé日", true, true),
        ("\u{10401}", false, false),
        ("\0\0", true, false),
    ];
    for &(text, cesu8_ok, java_ok) in cases.iter() {
        assert_eq!(cesu8_ok, is_valid_cesu8(text));
        assert_eq!(java_ok, is_valid_java_cesu8(text));
    }
}
444
445
446/// Encode a single surrogate as CESU-8.
447fn enc_surrogate(surrogate: u16) -> [u8; 3] {
448 assert!(0xD800 <= surrogate && surrogate <= 0xDFFF);
449 // 1110xxxx 10xxxxxx 10xxxxxx
450 [0b11100000 | ((surrogate & 0b11110000_00000000) >> 12) as u8,
451 TAG_CONT_U8 | ((surrogate & 0b00001111_11000000) >> 6) as u8,
452 TAG_CONT_U8 | ((surrogate & 0b00000000_00111111) ) as u8]
453}