// src/lib.rs - platform/external/rust/crates/cesu8 - Gitiles

 // Copyright 2012-2014 The Rust Project Developers and Eric Kidd.  See the
 // COPYRIGHT-RUST.txt file at the top-level directory of this distribution.
 //
 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
 // option. This file may not be copied, modified, or distributed except
 // according to those terms.


 //! A simple library implementing the [CESU-8 compatibility encoding
 //! scheme](http://www.unicode.org/reports/tr26/tr26-2.html).  This is a
 //! non-standard variant of UTF-8 that is used internally by some systems
 //! that need to represent UTF-16 data as 8-bit characters.  Yes, this is
 //! ugly.
 //!
 //! Use of this encoding is discouraged by the Unicode Consortium.  It's OK
 //! for working with existing internal APIs, but it should not be used for
 //! transmitting or storing data.
 //!
 //! ```
 //! use std::borrow::Cow;
 //! use cesu8::{from_cesu8, to_cesu8};
 //!
 //! // 16-bit Unicode characters are the same in UTF-8 and CESU-8.
 //! assert_eq!(Cow::Borrowed("aé日".as_bytes()),
 //!            to_cesu8("aé日"));
 //! assert_eq!(Cow::Borrowed("aé日"),
 //!            from_cesu8("aé日".as_bytes()).unwrap());
 //!
 //! // This string is CESU-8 data containing a 6-byte surrogate pair,
 //! // which decodes to a 4-byte UTF-8 string.
 //! let data = &[0xED, 0xA0, 0x81, 0xED, 0xB0, 0x81];
 //! assert_eq!(Cow::Borrowed("\u{10401}"),
 //!            from_cesu8(data).unwrap());
 //! ```
 //!
 //! ### A note about security
 //!
 //! As a general rule, this library is intended to fail on malformed or
 //! unexpected input.  CESU-8 is supposed to be an internal-only format,
 //! and if we're seeing malformed data, we assume that it's either a bug in
 //! somebody's code, or an attacker is trying to improperly encode data to
 //! evade security checks.
 //!
 //! If you have a use case for lossy conversion to UTF-8, or conversion
 //! from mixed UTF-8/CESU-8 data, please feel free to submit a pull request
 //! for `from_cesu8_lossy_permissive` with appropriate behavior.
 //!
 //! ### Java and U+0000, and other variants
 //!
 //! Java uses the CESU-8 encoding as described above, but with one
 //! difference: The null character U+0000 is represented as an overlong
 //! UTF-8 sequence `C0 80`. This is supported by the `from_java_cesu8` and
 //! `to_java_cesu8` methods.
 //!
 //! ### Surrogate pairs and UTF-8
 //!
 //! The UTF-16 encoding uses "surrogate pairs" to represent Unicode code
 //! points in the range from U+10000 to U+10FFFF.  These are 16-bit numbers
 //! in the range 0xD800 to 0xDFFF.
 //!
 //! * 0xD800 to 0xDBFF: First half of surrogate pair.  When encoded as
 //!   CESU-8, these become **1110**1101 **10**100000 **10**000000 to
 //!   **1110**1101 **10**101111 **10**111111.
 //!
 //! * 0xDC00 to 0xDFFF: Second half of surrogate pair.  These become
 //!   **1110**1101 **10**110000 **10**000000 to
 //!   **1110**1101 **10**111111 **10**111111.
 //!
 //! Wikipedia [explains](http://en.wikipedia.org/wiki/UTF-16) the
 //! code point to UTF-16 conversion process:
 //!
 //! > Consider the encoding of U+10437 (𐐷):
 //! >
 //! > * Subtract 0x10000 from 0x10437. The result is 0x00437, 0000 0000 0100
 //! >   0011 0111.
 //! > * Split this into the high 10-bit value and the low 10-bit value:
 //! >   0000000001 and 0000110111.
 //! > * Add 0xD800 to the high value to form the high surrogate: 0xD800 +
 //! >   0x0001 = 0xD801.
 //! > * Add 0xDC00 to the low value to form the low surrogate: 0xDC00 +
 //! >   0x0037 = 0xDC37.

 #![warn(missing_docs)]


 use std::borrow::Cow;
 use std::error::Error;
 use std::fmt;
 use std::result::Result;
 use std::slice;
 use std::str::{from_utf8, from_utf8_unchecked};
 use unicode::utf8_char_width;

 mod unicode;

 /// Mask selecting the six payload bits of a UTF-8 continuation byte.
 const CONT_MASK: u8 = 0b0011_1111;
 /// Tag bits (`10xxxxxx`) that mark a continuation byte; a byte `b` is a
 /// continuation byte exactly when `b & !CONT_MASK == TAG_CONT_U8`.
 const TAG_CONT_U8: u8 = 0b1000_0000;

 /// The CESU-8 data could not be decoded as valid UTF-8 data.
 ///
 /// This is a unit struct: it carries no information about where or why
 /// decoding failed.
 #[derive(Clone, Copy, Debug)]
 pub struct Cesu8DecodingError;

 impl Error for Cesu8DecodingError {
     /// Fixed short description, kept for callers of the legacy
     /// `Error::description` API.
     fn description(&self) -> &str { "decoding error" }
     /// A decoding failure never wraps another error.
     fn cause(&self) -> Option<&dyn Error> { None }
 }

 impl fmt::Display for Cesu8DecodingError {
     /// Writes the fixed, human-readable failure message.
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
         write!(f, "could not convert CESU-8 data to UTF-8")
     }
 }

 /// Which variant of the encoding are we working with?
 #[derive(Clone, Copy, Debug, PartialEq, Eq)]
 enum Variant {
     /// Regular CESU-8, with '\0' represented by itself.
     Standard,
     /// This is technically Java's "Modified UTF-8", which is like CESU-8
     /// except that U+0000 is written as the overlong two-byte sequence
     /// `C0 80` rather than as a single zero byte.  I'm sure it seemed like
     /// a good idea at the time.
     Java,
 }

 /// Convert CESU-8 data to a Rust string, re-encoding only if necessary.
 /// Returns an error if the data cannot be represented as valid UTF-8.
 ///
 /// Input that is already valid UTF-8 is handed back as `Cow::Borrowed`
 /// without copying; anything that had to be re-encoded comes back as
 /// `Cow::Owned`.
 ///
 /// # Errors
 ///
 /// Returns `Cesu8DecodingError` when `bytes` is neither valid UTF-8 nor
 /// decodable as CESU-8.
 ///
 /// # Examples
 ///
 /// ```
 /// use std::borrow::Cow;
 /// use cesu8::from_cesu8;
 ///
 /// // This string is valid as UTF-8 or CESU-8, so it doesn't change,
 /// // and we can convert it without allocating memory.
 /// assert_eq!(Cow::Borrowed("aé日"),
 ///            from_cesu8("aé日".as_bytes()).unwrap());
 ///
 /// // This string is CESU-8 data containing a 6-byte surrogate pair,
 /// // which becomes a 4-byte UTF-8 string.
 /// let data = &[0xED, 0xA0, 0x81, 0xED, 0xB0, 0x81];
 /// assert_eq!(Cow::Borrowed("\u{10401}"),
 ///            from_cesu8(data).unwrap());
 /// ```
 pub fn from_cesu8(bytes: &[u8]) -> Result<Cow<str>, Cesu8DecodingError> {
     from_cesu8_internal(bytes, Variant::Standard)
 }

 /// Convert Java's modified UTF-8 data to a Rust string, re-encoding only if
 /// necessary. Returns an error if the data cannot be represented as valid
 /// UTF-8.
 ///
 /// Input that is already valid UTF-8 is handed back as `Cow::Borrowed`
 /// without copying; anything that had to be re-encoded comes back as
 /// `Cow::Owned`.
 ///
 /// # Errors
 ///
 /// Returns `Cesu8DecodingError` when `bytes` is neither valid UTF-8 nor
 /// decodable as modified UTF-8.
 ///
 /// # Examples
 ///
 /// ```
 /// use std::borrow::Cow;
 /// use cesu8::from_java_cesu8;
 ///
 /// // This string is valid as UTF-8 or modified UTF-8, so it doesn't change,
 /// // and we can convert it without allocating memory.
 /// assert_eq!(Cow::Borrowed("aé日"),
 ///            from_java_cesu8("aé日".as_bytes()).unwrap());
 ///
 /// // This string is modified UTF-8 data containing a 6-byte surrogate pair,
 /// // which becomes a 4-byte UTF-8 string.
 /// let data = &[0xED, 0xA0, 0x81, 0xED, 0xB0, 0x81];
 /// assert_eq!(Cow::Borrowed("\u{10401}"),
 ///            from_java_cesu8(data).unwrap());
 ///
 /// // This string is modified UTF-8 data containing null code-points.
 /// let data = &[0xC0, 0x80, 0xC0, 0x80];
 /// assert_eq!(Cow::Borrowed("\0\0"),
 ///            from_java_cesu8(data).unwrap());
 /// ```
 pub fn from_java_cesu8(bytes: &[u8]) -> Result<Cow<str>, Cesu8DecodingError> {
     from_cesu8_internal(bytes, Variant::Java)
 }

 /// Do the actual work of decoding.
 ///
 /// Input that already validates as UTF-8 is borrowed untouched, for either
 /// variant; the CESU-8 decoder only runs when that validation fails.
 fn from_cesu8_internal(bytes: &[u8], variant: Variant) ->
     Result<Cow<str>, Cesu8DecodingError>
 {
     // Fast path: nothing to re-encode, so no allocation is needed.
     if let Ok(text) = from_utf8(bytes) {
         return Ok(Cow::Borrowed(text));
     }

     // Slow path: re-encode into a fresh buffer.
     let mut buf = Vec::with_capacity(bytes.len());
     if !decode_from_iter(&mut buf, &mut bytes.iter(), variant) {
         return Err(Cesu8DecodingError);
     }

     // Keep this assertion in debug mode only.  It's important that this
     // assertion is true, because Rust assumes that all UTF-8 strings are
     // valid.
     debug_assert!(from_utf8(&buf[..]).is_ok());
     // SAFETY: relies on `decode_from_iter` having appended only
     // well-formed UTF-8 when it returns `true`; the debug assertion above
     // re-checks that in debug builds.
     Ok(Cow::Owned(unsafe { String::from_utf8_unchecked(buf) }))
 }

 // Decodes one input mixing a 1-byte, a 3-byte and a surrogate-pair
 // sequence, and checks the resulting UTF-8 text.
 #[test]
 fn test_from_cesu8() {
     // The surrogate-encoded character below is from the ICU library's
     // icu/source/test/testdata/conversion.txt test case.
     let data = &[0x4D, 0xE6, 0x97, 0xA5, 0xED, 0xA0, 0x81, 0xED, 0xB0, 0x81, 0x7F];
     assert_eq!(Cow::Borrowed("M日\u{10401}\u{7F}"),
                from_cesu8(data).unwrap());

     // We used to have test data from the CESU-8 specification, but when we
     // worked it through manually, we got the wrong answer:
     //
     // Input: [0xED, 0xAE, 0x80, 0xED, 0xB0, 0x80]
     // Binary: 11101101 10101110 10000000 11101101 10110000 10000000
     //
     // 0b1101_101110_000000 -> 0xDB80
     // 0b1101_110000_000000 -> 0xDC00
     //
     // ((0xDB80 - 0xD800) << 10) | (0xDC00 - 0xDC00) -> 0xE0000
     // 0x10000 + 0xE0000 -> 0xF0000
     //
     // The spec claims that we are supposed to get 0x10000, not 0xF0000.
     // Since I can't reconcile this example data with the text of the
     // specification, I decided to use a test character from ICU instead.
 }

 /// Our internal decoder, based on Rust's is_utf8 implementation.
 ///
 /// Consumes bytes from `iter` and appends the equivalent UTF-8 to
 /// `decoded`.  Returns `true` once the input ends on a sequence boundary,
 /// or `false` as soon as a truncated or unacceptable sequence is seen; on
 /// failure `decoded` keeps whatever had been appended so far.
 fn decode_from_iter(
     decoded: &mut Vec<u8>, iter: &mut slice::Iter<u8>, variant: Variant)
     -> bool
 {
     // Abort decoding and report failure to the caller.
     macro_rules! err {
         () => { return false }
     }
     // Read the next byte, failing if the input has run out.
     macro_rules! next {
         () => {
             match iter.next() {
                 Some(a) => *a,
                 // We needed data, but there was none: error!
                 None => err!()
             }
         }
     }
     // Read the next byte and insist that it is a continuation byte.
     macro_rules! next_cont {
         () => {
             {
                 let byte = next!();
                 if (byte) & !CONT_MASK == TAG_CONT_U8 { byte } else { err!() }
             }
         }
     }

     loop {
         let first = match iter.next() {
             Some(&b) => b,
             // We're at the end of the iterator and a codepoint boundary at
             // the same time, so this string is valid.
             None => return true
         };

         if variant == Variant::Java && first == 0 {
             // Java's modified UTF-8 should never contain \0 directly.
             err!();
         } else if first < 128 {
             // Pass ASCII through directly.
             decoded.push(first);
         } else if first == 0xc0 && variant == Variant::Java {
             // Java spells U+0000 as the two bytes C0 80.
             match next!() {
                 0x80 => decoded.push(0),
                 _ => err!(),
             }
         } else {
             let w = utf8_char_width(first);
             // Every multi-byte form needs at least one continuation byte,
             // so fetch it before looking at the width.
             let second = next_cont!();
             match w {
                 // Two-byte sequences can be used directly.
                 2 => decoded.extend_from_slice(&[first, second]),
                 3 => {
                     let third = next_cont!();
                     match (first, second) {
                         // These are valid UTF-8, so pass them through.
                         (0xE0        , 0xA0..=0xBF) |
                         (0xE1..=0xEC , 0x80..=0xBF) |
                         (0xED        , 0x80..=0x9F) |
                         (0xEE..=0xEF , 0x80..=0xBF) => {
                             decoded.extend_from_slice(&[first, second, third]);
                         }
                         // First half of a surrogate pair, so decode.
                         (0xED        , 0xA0..=0xAF) => {
                             // The second half must follow immediately and
                             // have the form ED B0..BF xx.
                             if next!() != 0xED { err!() }
                             let fifth = next_cont!();
                             if fifth < 0xB0 || 0xBF < fifth { err!() }
                             let sixth = next_cont!();
                             let s = dec_surrogates(second, third, fifth, sixth);
                             decoded.extend_from_slice(&s);
                         }
                         // Anything else (including a lone second-half
                         // surrogate) is rejected.
                         _ => err!()
                     }
                 }
                 // Any other width is not acceptable here.
                 _ => err!()
             }
         }
     }
 }

 /// Convert the two trailing bytes from a CESU-8 surrogate to a regular
 /// surrogate value.
 ///
 /// The leading `ED` byte is implied: its four payload bits are always
 /// `1101`, which is where the constant `0xD000` comes from.
 fn dec_surrogate(second: u8, third: u8) -> u32 {
     let upper = u32::from(second & CONT_MASK) << 6;
     let lower = u32::from(third & CONT_MASK);
     0xD000u32 | upper | lower
 }

 /// Convert the bytes from a CESU-8 surrogate pair into a valid UTF-8
 /// sequence.  Assumes input is valid.
 ///
 /// `second`/`third` are the continuation bytes of the first surrogate and
 /// `fifth`/`sixth` those of the second one.  Panics if the combined value
 /// falls outside U+10000..=U+10FFFF.
 fn dec_surrogates(second: u8, third: u8, fifth: u8, sixth: u8) -> [u8; 4] {
     // Convert to a 32-bit code point.
     let lead = dec_surrogate(second, third);
     let trail = dec_surrogate(fifth, sixth);
     let c = 0x10000 + (((lead - 0xD800) << 10) | (trail - 0xDC00));
     assert!(0x010000 <= c && c <= 0x10FFFF);

     // Convert to UTF-8.
     // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
     let six_bits = u32::from(CONT_MASK);
     [0b1111_0000u8 | ((c >> 18) & 0b111) as u8,
      TAG_CONT_U8   | ((c >> 12) & six_bits) as u8,
      TAG_CONT_U8   | ((c >>  6) & six_bits) as u8,
      TAG_CONT_U8   | (c & six_bits) as u8]
 }

 /// Convert a Rust `&str` to CESU-8 bytes.
 ///
 /// Text that is already valid CESU-8 is returned as `Cow::Borrowed`;
 /// otherwise a re-encoded copy is returned as `Cow::Owned`.
 ///
 /// ```
 /// use std::borrow::Cow;
 /// use cesu8::to_cesu8;
 ///
 /// // This string is valid as UTF-8 or CESU-8, so it doesn't change,
 /// // and we can convert it without allocating memory.
 /// assert_eq!(Cow::Borrowed("aé日".as_bytes()), to_cesu8("aé日"));
 ///
 /// // This string is a 4-byte UTF-8 string, which becomes a 6-byte CESU-8
 /// // vector.
 /// assert_eq!(Cow::Borrowed(&[0xED, 0xA0, 0x81, 0xED, 0xB0, 0x81]),
 ///            to_cesu8("\u{10401}"));
 /// ```
 pub fn to_cesu8(text: &str) -> Cow<[u8]> {
     if !is_valid_cesu8(text) {
         // Something needs rewriting, so allocate.
         return Cow::Owned(to_cesu8_internal(text, Variant::Standard));
     }
     Cow::Borrowed(text.as_bytes())
 }

 /// Convert a Rust `&str` to Java's modified UTF-8 bytes.
 ///
 /// Text that is already valid modified UTF-8 is returned as
 /// `Cow::Borrowed`; otherwise a re-encoded copy is returned as
 /// `Cow::Owned`.
 ///
 /// ```
 /// use std::borrow::Cow;
 /// use cesu8::to_java_cesu8;
 ///
 /// // This string is valid as UTF-8 or CESU-8, so it doesn't change,
 /// // and we can convert it without allocating memory.
 /// assert_eq!(Cow::Borrowed("aé日".as_bytes()), to_java_cesu8("aé日"));
 ///
 /// // This string is a 4-byte UTF-8 string, which becomes a 6-byte modified
 /// // UTF-8 vector.
 /// assert_eq!(Cow::Borrowed(&[0xED, 0xA0, 0x81, 0xED, 0xB0, 0x81]),
 ///            to_java_cesu8("\u{10401}"));
 ///
 /// // This string contains null, which becomes 2-byte modified UTF-8 encoding
 /// assert_eq!(Cow::Borrowed(&[0xC0, 0x80, 0xC0, 0x80]),
 ///            to_java_cesu8("\0\0"));
 /// ```
 pub fn to_java_cesu8(text: &str) -> Cow<[u8]> {
     if !is_valid_java_cesu8(text) {
         // Something needs rewriting, so allocate.
         return Cow::Owned(to_cesu8_internal(text, Variant::Java));
     }
     Cow::Borrowed(text.as_bytes())
 }

 /// Encode `text` as CESU-8, or as Java's modified UTF-8 when `variant` is
 /// `Variant::Java`, always allocating a new vector.
 ///
 /// ASCII and other short sequences are copied unchanged, four-byte
 /// sequences are rewritten as two three-byte surrogates, and in the Java
 /// variant every zero byte becomes `C0 80`.
 fn to_cesu8_internal(text: &str, variant: Variant) -> Vec<u8> {
     let bytes = text.as_bytes();
     // Reserve the input length plus a quarter as headroom for expansion.
     // The parentheses matter: `+` binds more tightly than `>>`, so without
     // them the hint would be only half the input length.
     let mut encoded = Vec::with_capacity(bytes.len() + (bytes.len() >> 2));
     let mut i = 0;
     while i < bytes.len() {
         let b = bytes[i];
         if variant == Variant::Java && b == 0 {
             // Java never stores a raw zero byte.
             encoded.push(0xc0);
             encoded.push(0x80);
             i += 1;
         } else if b < 128 {
             // Pass ASCII through quickly.
             encoded.push(b);
             i += 1;
         } else {
             // Figure out how many bytes we need for this character.
             let w = utf8_char_width(b);
             assert!(w <= 4);
             assert!(i + w <= bytes.len());
             if w != 4 {
                 // Pass through short UTF-8 sequences unmodified.
                 encoded.extend_from_slice(&bytes[i..i+w]);
             } else {
                 // Encode 4-byte sequences as 6 bytes.
                 // SAFETY: `bytes` comes from a `&str`, and `i` only ever
                 // advances by one whole character, so this slice is
                 // expected to be exactly one well-formed 4-byte sequence
                 // (this assumes `utf8_char_width` reports the encoded
                 // length of the character starting at `b`).
                 let s = unsafe { from_utf8_unchecked(&bytes[i..i+w]) };
                 let c = s.chars().next().unwrap() as u32 - 0x10000;
                 // Split the 20-bit offset into a UTF-16 surrogate pair.
                 let mut s: [u16; 2] = [0; 2];
                 s[0] = ((c >> 10) as u16)   | 0xD800;
                 s[1] = ((c & 0x3FF) as u16) | 0xDC00;
                 encoded.extend_from_slice(&enc_surrogate(s[0]));
                 encoded.extend_from_slice(&enc_surrogate(s[1]));
             }
             i += w;
         }
     }
     encoded
 }

 /// Check whether a Rust string contains valid CESU-8 data.
 ///
 /// Returns `false` as soon as a non-continuation byte is found for which
 /// `utf8_char_width` reports a width greater than 3, and `true` otherwise.
 pub fn is_valid_cesu8(text: &str) -> bool {
     // We rely on the fact that Rust strings are guaranteed to be valid
     // UTF-8, so only the leading byte of each sequence needs inspecting;
     // continuation bytes are skipped.
     text.bytes()
         .all(|b| (b & !CONT_MASK) == TAG_CONT_U8 || utf8_char_width(b) <= 3)
 }

 /// Check whether a Rust string contains valid Java's modified UTF-8 data.
 ///
 /// This is the CESU-8 check plus the extra requirement that the text
 /// contains no `'\0'` character.
 pub fn is_valid_java_cesu8(text: &str) -> bool {
     if text.contains('\0') {
         return false;
     }
     is_valid_cesu8(text)
 }

 // Exercises both validity predicates on the same three inputs.
 #[test]
 fn test_valid_cesu8() {
     // Characters of at most three bytes are valid in both variants.
     assert!(is_valid_cesu8("aé日"));
     assert!(is_valid_java_cesu8("aé日"));
     // A four-byte character is rejected by both.
     assert!(!is_valid_cesu8("\u{10401}"));
     assert!(!is_valid_java_cesu8("\u{10401}"));
     // NUL is accepted by plain CESU-8 but rejected by the Java variant.
     assert!(is_valid_cesu8("\0\0"));
     assert!(!is_valid_java_cesu8("\0\0"));
 }


 /// Encode a single surrogate as CESU-8.
 ///
 /// Produces the three-byte form `1110xxxx 10xxxxxx 10xxxxxx`.  Panics if
 /// `surrogate` is outside `0xD800..=0xDFFF`.
 fn enc_surrogate(surrogate: u16) -> [u8; 3] {
     assert!(0xD800 <= surrogate && surrogate <= 0xDFFF);
     // Top four bits go in the lead byte, then two groups of six bits.
     let lead = 0b1110_0000 | (surrogate >> 12) as u8;
     let middle = TAG_CONT_U8 | ((surrogate >> 6) & 0x3F) as u8;
     let last = TAG_CONT_U8 | (surrogate & 0x3F) as u8;
     [lead, middle, last]
 }
	// Copyright 2012-2014 The Rust Project Developers and Eric Kidd. See the
	// COPYRIGHT-RUST.txt file at the top-level directory of this distribution.
	//
	// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
	// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
	// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
	// option. This file may not be copied, modified, or distributed except
	// according to those terms.


	//! A simple library implementing the [CESU-8 compatibility encoding
	//! scheme](http://www.unicode.org/reports/tr26/tr26-2.html). This is a
	//! non-standard variant of UTF-8 that is used internally by some systems
	//! that need to represent UTF-16 data as 8-bit characters. Yes, this is
	//! ugly.
	//!
	//! Use of this encoding is discouraged by the Unicode Consortium. It's OK
	//! for working with existing internal APIs, but it should not be used for
	//! transmitting or storing data.
	//!
	//! ```
	//! use std::borrow::Cow;
	//! use cesu8::{from_cesu8, to_cesu8};
	//!
	//! // 16-bit Unicode characters are the same in UTF-8 and CESU-8.
	//! assert_eq!(Cow::Borrowed("aé日".as_bytes()),
	//! to_cesu8("aé日"));
	//! assert_eq!(Cow::Borrowed("aé日"),
	//! from_cesu8("aé日".as_bytes()).unwrap());
	//!
	//! // This string is CESU-8 data containing a 6-byte surrogate pair,
	//! // which decodes to a 4-byte UTF-8 string.
	//! let data = &[0xED, 0xA0, 0x81, 0xED, 0xB0, 0x81];
	//! assert_eq!(Cow::Borrowed("\u{10401}"),
	//! from_cesu8(data).unwrap());
	//! ```
	//!
	//! ### A note about security
	//!
	//! As a general rule, this library is intended to fail on malformed or
	//! unexpected input. CESU-8 is supposed to be an internal-only format,
	//! and if we're seeing malformed data, we assume that it's either a bug in
	//! somebody's code, or an attacker is trying to improperly encode data to
	//! evade security checks.
	//!
	//! If you have a use case for lossy conversion to UTF-8, or conversion
	//! from mixed UTF-8/CESU-8 data, please feel free to submit a pull request
	//! for `from_cesu8_lossy_permissive` with appropriate behavior.
	//!
	//! ### Java and U+0000, and other variants
	//!
	//! Java uses the CESU-8 encoding as described above, but with one
	//! difference: The null character U+0000 is represented as an overlong
	//! UTF-8 sequence `C0 80`. This is supported by the `from_java_cesu8` and
	//! `to_java_cesu8` methods.
	//!
	//! ### Surrogate pairs and UTF-8
	//!
	//! The UTF-16 encoding uses "surrogate pairs" to represent Unicode code
	//! points in the range from U+10000 to U+10FFFF. These are 16-bit numbers
	//! in the range 0xD800 to 0xDFFF.
	//!
	//! * 0xD800 to 0xDBFF: First half of surrogate pair. When encoded as
	//! CESU-8, these become 11101101 10100000 10000000 to
	//! 11101101 10101111 10111111.
	//!
	//! * 0xDC00 to 0xDFFF: Second half of surrogate pair. These become
	//! 11101101 10110000 10000000 to
	//! 11101101 10111111 10111111.
	//!
	//! Wikipedia [explains](http://en.wikipedia.org/wiki/UTF-16) the
	//! code point to UTF-16 conversion process:
	//!
	//! > Consider the encoding of U+10437 (𐐷):
	//! >
	//! > * Subtract 0x10000 from 0x10437. The result is 0x00437, 0000 0000 0100
	//! > 0011 0111.
	//! > * Split this into the high 10-bit value and the low 10-bit value:
	//! > 0000000001 and 0000110111.
	//! > * Add 0xD800 to the high value to form the high surrogate: 0xD800 +
	//! > 0x0001 = 0xD801.
	//! > * Add 0xDC00 to the low value to form the low surrogate: 0xDC00 +
	//! > 0x0037 = 0xDC37.

	#![warn(missing_docs)]


	use std::borrow::Cow;
	use std::error::Error;
	use std::fmt;
	use std::result::Result;
	use std::slice;
	use std::str::{from_utf8, from_utf8_unchecked};
	use unicode::utf8_char_width;

	mod unicode;

	/// Mask selecting the six payload bits of a UTF-8 continuation byte.
	const CONT_MASK: u8 = 0b0011_1111;
	/// Tag bits (`10xxxxxx`) that mark a continuation byte; a byte `b` is a
	/// continuation byte exactly when `b & !CONT_MASK == TAG_CONT_U8`.
	const TAG_CONT_U8: u8 = 0b1000_0000;

	/// The CESU-8 data could not be decoded as valid UTF-8 data.
	///
	/// This is a unit struct: it carries no information about where or why
	/// decoding failed.
	#[derive(Clone, Copy, Debug)]
	pub struct Cesu8DecodingError;

	impl Error for Cesu8DecodingError {
	    /// Fixed short description, kept for callers of the legacy
	    /// `Error::description` API.
	    fn description(&self) -> &str { "decoding error" }
	    /// A decoding failure never wraps another error.
	    fn cause(&self) -> Option<&dyn Error> { None }
	}

	impl fmt::Display for Cesu8DecodingError {
	    /// Writes the fixed, human-readable failure message.
	    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
	        write!(f, "could not convert CESU-8 data to UTF-8")
	    }
	}

	/// Which variant of the encoding are we working with?
	#[derive(Clone, Copy, Debug, PartialEq, Eq)]
	enum Variant {
	    /// Regular CESU-8, with '\0' represented by itself.
	    Standard,
	    /// This is technically Java's "Modified UTF-8", which is like CESU-8
	    /// except that U+0000 is written as the overlong two-byte sequence
	    /// `C0 80` rather than as a single zero byte. I'm sure it seemed like
	    /// a good idea at the time.
	    Java,
	}

	/// Convert CESU-8 data to a Rust string, re-encoding only if necessary.
	/// Returns an error if the data cannot be represented as valid UTF-8.
	///
	/// Input that is already valid UTF-8 is handed back as `Cow::Borrowed`
	/// without copying; anything that had to be re-encoded comes back as
	/// `Cow::Owned`.
	///
	/// # Errors
	///
	/// Returns `Cesu8DecodingError` when `bytes` is neither valid UTF-8 nor
	/// decodable as CESU-8.
	///
	/// # Examples
	///
	/// ```
	/// use std::borrow::Cow;
	/// use cesu8::from_cesu8;
	///
	/// // This string is valid as UTF-8 or CESU-8, so it doesn't change,
	/// // and we can convert it without allocating memory.
	/// assert_eq!(Cow::Borrowed("aé日"),
	///            from_cesu8("aé日".as_bytes()).unwrap());
	///
	/// // This string is CESU-8 data containing a 6-byte surrogate pair,
	/// // which becomes a 4-byte UTF-8 string.
	/// let data = &[0xED, 0xA0, 0x81, 0xED, 0xB0, 0x81];
	/// assert_eq!(Cow::Borrowed("\u{10401}"),
	///            from_cesu8(data).unwrap());
	/// ```
	pub fn from_cesu8(bytes: &[u8]) -> Result<Cow<str>, Cesu8DecodingError> {
	from_cesu8_internal(bytes, Variant::Standard)
	}

	/// Convert Java's modified UTF-8 data to a Rust string, re-encoding only if
	/// necessary. Returns an error if the data cannot be represented as valid
	/// UTF-8.
	///
	/// Input that is already valid UTF-8 is handed back as `Cow::Borrowed`
	/// without copying; anything that had to be re-encoded comes back as
	/// `Cow::Owned`.
	///
	/// # Errors
	///
	/// Returns `Cesu8DecodingError` when `bytes` is neither valid UTF-8 nor
	/// decodable as modified UTF-8.
	///
	/// # Examples
	///
	/// ```
	/// use std::borrow::Cow;
	/// use cesu8::from_java_cesu8;
	///
	/// // This string is valid as UTF-8 or modified UTF-8, so it doesn't change,
	/// // and we can convert it without allocating memory.
	/// assert_eq!(Cow::Borrowed("aé日"),
	///            from_java_cesu8("aé日".as_bytes()).unwrap());
	///
	/// // This string is modified UTF-8 data containing a 6-byte surrogate pair,
	/// // which becomes a 4-byte UTF-8 string.
	/// let data = &[0xED, 0xA0, 0x81, 0xED, 0xB0, 0x81];
	/// assert_eq!(Cow::Borrowed("\u{10401}"),
	///            from_java_cesu8(data).unwrap());
	///
	/// // This string is modified UTF-8 data containing null code-points.
	/// let data = &[0xC0, 0x80, 0xC0, 0x80];
	/// assert_eq!(Cow::Borrowed("\0\0"),
	///            from_java_cesu8(data).unwrap());
	/// ```
	pub fn from_java_cesu8(bytes: &[u8]) -> Result<Cow<str>, Cesu8DecodingError> {
	from_cesu8_internal(bytes, Variant::Java)
	}

	/// Do the actual work of decoding.
	///
	/// Input that already validates as UTF-8 is borrowed untouched, for either
	/// variant; the CESU-8 decoder only runs when that validation fails.
	fn from_cesu8_internal(bytes: &[u8], variant: Variant) ->
	    Result<Cow<str>, Cesu8DecodingError>
	{
	    // Fast path: nothing to re-encode, so no allocation is needed.
	    if let Ok(text) = from_utf8(bytes) {
	        return Ok(Cow::Borrowed(text));
	    }

	    // Slow path: re-encode into a fresh buffer.
	    let mut buf = Vec::with_capacity(bytes.len());
	    if !decode_from_iter(&mut buf, &mut bytes.iter(), variant) {
	        return Err(Cesu8DecodingError);
	    }

	    // Keep this assertion in debug mode only. It's important that this
	    // assertion is true, because Rust assumes that all UTF-8 strings are
	    // valid.
	    debug_assert!(from_utf8(&buf[..]).is_ok());
	    // SAFETY: relies on `decode_from_iter` having appended only
	    // well-formed UTF-8 when it returns `true`; the debug assertion above
	    // re-checks that in debug builds.
	    Ok(Cow::Owned(unsafe { String::from_utf8_unchecked(buf) }))
	}

	// Decodes one input mixing a 1-byte, a 3-byte and a surrogate-pair
	// sequence, and checks the resulting UTF-8 text.
	#[test]
	fn test_from_cesu8() {
	// The surrogate-encoded character below is from the ICU library's
	// icu/source/test/testdata/conversion.txt test case.
	let data = &[0x4D, 0xE6, 0x97, 0xA5, 0xED, 0xA0, 0x81, 0xED, 0xB0, 0x81, 0x7F];
	assert_eq!(Cow::Borrowed("M日\u{10401}\u{7F}"),
	from_cesu8(data).unwrap());

	// We used to have test data from the CESU-8 specification, but when we
	// worked it through manually, we got the wrong answer:
	//
	// Input: [0xED, 0xAE, 0x80, 0xED, 0xB0, 0x80]
	// Binary: 11101101 10101110 10000000 11101101 10110000 10000000
	//
	// 0b1101_101110_000000 -> 0xDB80
	// 0b1101_110000_000000 -> 0xDC00
	//
	// ((0xDB80 - 0xD800) << 10) | (0xDC00 - 0xDC00) -> 0xE0000
	// 0x10000 + 0xE0000 -> 0xF0000
	//
	// The spec claims that we are supposed to get 0x10000, not 0xF0000.
	// Since I can't reconcile this example data with the text of the
	// specification, I decided to use a test character from ICU instead.
	}

	// Our internal decoder, based on Rust's is_utf8 implementation.
	fn decode_from_iter(
	decoded: &mut Vec<u8>, iter: &mut slice::Iter<u8>, variant: Variant)
	-> bool
	{
	macro_rules! err {
	() => { return false }
	}
	macro_rules! next {
	() => {
	match iter.next() {
	Some(a) => *a,
	// We needed data, but there was none: error!
	None => err!()
	}
	}
	}
	macro_rules! next_cont {
	() => {
	{
	let byte = next!();
	if (byte) & !CONT_MASK == TAG_CONT_U8 { byte } else { err!() }
	}
	}
	}

	loop {
	let first = match iter.next() {
	Some(&b) => b,
	// We're at the end of the iterator and a codepoint boundary at
	// the same time, so this string is valid.
	None => return true
	};

	if variant == Variant::Java && first == 0 {
	// Java's modified UTF-8 should never contain \0 directly.
	err!();
	} else if first < 128 {
	// Pass ASCII through directly.
	decoded.push(first);
	} else if first == 0xc0 && variant == Variant::Java {
	match next!() {
	0x80 => decoded.push(0),
	_ => err!(),
	}
	} else {
	let w = utf8_char_width(first);
	let second = next_cont!();
	match w {
	// Two-byte sequences can be used directly.
	2 => { decoded.extend([first, second].iter().cloned()); }
	3 => {
	let third = next_cont!();
	match (first, second) {
	// These are valid UTF-8, so pass them through.
	(0xE0 , 0xA0 ... 0xBF) \|
	(0xE1 ... 0xEC, 0x80 ... 0xBF) \|
	(0xED , 0x80 ... 0x9F) \|
	(0xEE ... 0xEF, 0x80 ... 0xBF) => {
	decoded.extend([first, second, third].iter()
	.cloned())
	}
	// First half a surrogate pair, so decode.
	(0xED , 0xA0 ... 0xAF) => {
	if next!() != 0xED { err!() }
	let fifth = next_cont!();
	if fifth < 0xB0 \|\| 0xBF < fifth { err!() }
	let sixth = next_cont!();
	let s = dec_surrogates(second, third, fifth, sixth);
	decoded.extend(s.iter().cloned());
	}
	_ => err!()
	}
	}
	_ => err!()
	}
	}
	}
	}

	/// Convert the two trailing bytes from a CESU-8 surrogate to a regular
	/// surrogate value.
	///
	/// The leading `ED` byte is implied: its four payload bits are always
	/// `1101`, which is where the constant `0xD000` comes from.
	fn dec_surrogate(second: u8, third: u8) -> u32 {
	    let upper = u32::from(second & CONT_MASK) << 6;
	    let lower = u32::from(third & CONT_MASK);
	    0xD000u32 | upper | lower
	}

	/// Convert the bytes from a CESU-8 surrogate pair into a valid UTF-8
	/// sequence. Assumes input is valid.
	///
	/// `second`/`third` are the continuation bytes of the first surrogate and
	/// `fifth`/`sixth` those of the second one. Panics if the combined value
	/// falls outside U+10000..=U+10FFFF.
	fn dec_surrogates(second: u8, third: u8, fifth: u8, sixth: u8) -> [u8; 4] {
	    // Convert to a 32-bit code point.
	    let lead = dec_surrogate(second, third);
	    let trail = dec_surrogate(fifth, sixth);
	    let c = 0x10000 + (((lead - 0xD800) << 10) | (trail - 0xDC00));
	    assert!(0x010000 <= c && c <= 0x10FFFF);

	    // Convert to UTF-8.
	    // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
	    let six_bits = u32::from(CONT_MASK);
	    [0b1111_0000u8 | ((c >> 18) & 0b111) as u8,
	     TAG_CONT_U8   | ((c >> 12) & six_bits) as u8,
	     TAG_CONT_U8   | ((c >>  6) & six_bits) as u8,
	     TAG_CONT_U8   | (c & six_bits) as u8]
	}

	/// Convert a Rust `&str` to CESU-8 bytes.
	///
	/// Text that is already valid CESU-8 is returned as `Cow::Borrowed`;
	/// otherwise a re-encoded copy is returned as `Cow::Owned`.
	///
	/// ```
	/// use std::borrow::Cow;
	/// use cesu8::to_cesu8;
	///
	/// // This string is valid as UTF-8 or CESU-8, so it doesn't change,
	/// // and we can convert it without allocating memory.
	/// assert_eq!(Cow::Borrowed("aé日".as_bytes()), to_cesu8("aé日"));
	///
	/// // This string is a 4-byte UTF-8 string, which becomes a 6-byte CESU-8
	/// // vector.
	/// assert_eq!(Cow::Borrowed(&[0xED, 0xA0, 0x81, 0xED, 0xB0, 0x81]),
	///            to_cesu8("\u{10401}"));
	/// ```
	pub fn to_cesu8(text: &str) -> Cow<[u8]> {
	    if !is_valid_cesu8(text) {
	        // Something needs rewriting, so allocate.
	        return Cow::Owned(to_cesu8_internal(text, Variant::Standard));
	    }
	    Cow::Borrowed(text.as_bytes())
	}

	/// Convert a Rust `&str` to Java's modified UTF-8 bytes.
	///
	/// Text that is already valid modified UTF-8 is returned as
	/// `Cow::Borrowed`; otherwise a re-encoded copy is returned as
	/// `Cow::Owned`.
	///
	/// ```
	/// use std::borrow::Cow;
	/// use cesu8::to_java_cesu8;
	///
	/// // This string is valid as UTF-8 or CESU-8, so it doesn't change,
	/// // and we can convert it without allocating memory.
	/// assert_eq!(Cow::Borrowed("aé日".as_bytes()), to_java_cesu8("aé日"));
	///
	/// // This string is a 4-byte UTF-8 string, which becomes a 6-byte modified
	/// // UTF-8 vector.
	/// assert_eq!(Cow::Borrowed(&[0xED, 0xA0, 0x81, 0xED, 0xB0, 0x81]),
	///            to_java_cesu8("\u{10401}"));
	///
	/// // This string contains null, which becomes 2-byte modified UTF-8 encoding
	/// assert_eq!(Cow::Borrowed(&[0xC0, 0x80, 0xC0, 0x80]),
	///            to_java_cesu8("\0\0"));
	/// ```
	pub fn to_java_cesu8(text: &str) -> Cow<[u8]> {
	    if !is_valid_java_cesu8(text) {
	        // Something needs rewriting, so allocate.
	        return Cow::Owned(to_cesu8_internal(text, Variant::Java));
	    }
	    Cow::Borrowed(text.as_bytes())
	}

	/// Encode `text` as CESU-8, or as Java's modified UTF-8 when `variant` is
	/// `Variant::Java`, always allocating a new vector.
	///
	/// ASCII and other short sequences are copied unchanged, four-byte
	/// sequences are rewritten as two three-byte surrogates, and in the Java
	/// variant every zero byte becomes `C0 80`.
	fn to_cesu8_internal(text: &str, variant: Variant) -> Vec<u8> {
	    let bytes = text.as_bytes();
	    // Reserve the input length plus a quarter as headroom for expansion.
	    // The parentheses matter: `+` binds more tightly than `>>`, so without
	    // them the hint would be only half the input length.
	    let mut encoded = Vec::with_capacity(bytes.len() + (bytes.len() >> 2));
	    let mut i = 0;
	    while i < bytes.len() {
	        let b = bytes[i];
	        if variant == Variant::Java && b == 0 {
	            // Java never stores a raw zero byte.
	            encoded.push(0xc0);
	            encoded.push(0x80);
	            i += 1;
	        } else if b < 128 {
	            // Pass ASCII through quickly.
	            encoded.push(b);
	            i += 1;
	        } else {
	            // Figure out how many bytes we need for this character.
	            let w = utf8_char_width(b);
	            assert!(w <= 4);
	            assert!(i + w <= bytes.len());
	            if w != 4 {
	                // Pass through short UTF-8 sequences unmodified.
	                encoded.extend_from_slice(&bytes[i..i+w]);
	            } else {
	                // Encode 4-byte sequences as 6 bytes.
	                // SAFETY: `bytes` comes from a `&str`, and `i` only ever
	                // advances by one whole character, so this slice is
	                // expected to be exactly one well-formed 4-byte sequence
	                // (this assumes `utf8_char_width` reports the encoded
	                // length of the character starting at `b`).
	                let s = unsafe { from_utf8_unchecked(&bytes[i..i+w]) };
	                let c = s.chars().next().unwrap() as u32 - 0x10000;
	                // Split the 20-bit offset into a UTF-16 surrogate pair.
	                let mut s: [u16; 2] = [0; 2];
	                s[0] = ((c >> 10) as u16)   | 0xD800;
	                s[1] = ((c & 0x3FF) as u16) | 0xDC00;
	                encoded.extend_from_slice(&enc_surrogate(s[0]));
	                encoded.extend_from_slice(&enc_surrogate(s[1]));
	            }
	            i += w;
	        }
	    }
	    encoded
	}

	/// Check whether a Rust string contains valid CESU-8 data.
	///
	/// Returns `false` as soon as a non-continuation byte is found for which
	/// `utf8_char_width` reports a width greater than 3, and `true` otherwise.
	pub fn is_valid_cesu8(text: &str) -> bool {
	    // We rely on the fact that Rust strings are guaranteed to be valid
	    // UTF-8, so only the leading byte of each sequence needs inspecting;
	    // continuation bytes are skipped.
	    text.bytes()
	        .all(|b| (b & !CONT_MASK) == TAG_CONT_U8 || utf8_char_width(b) <= 3)
	}

	/// Check whether a Rust string contains valid Java's modified UTF-8 data.
	///
	/// This is the CESU-8 check plus the extra requirement that the text
	/// contains no `'\0'` character.
	pub fn is_valid_java_cesu8(text: &str) -> bool {
	    if text.contains('\0') {
	        return false;
	    }
	    is_valid_cesu8(text)
	}

	// Exercises both validity predicates on the same three inputs.
	#[test]
	fn test_valid_cesu8() {
	// Characters of at most three bytes are valid in both variants.
	assert!(is_valid_cesu8("aé日"));
	assert!(is_valid_java_cesu8("aé日"));
	// A four-byte character is rejected by both.
	assert!(!is_valid_cesu8("\u{10401}"));
	assert!(!is_valid_java_cesu8("\u{10401}"));
	// NUL is accepted by plain CESU-8 but rejected by the Java variant.
	assert!(is_valid_cesu8("\0\0"));
	assert!(!is_valid_java_cesu8("\0\0"));
	}


	/// Encode a single surrogate as CESU-8.
	///
	/// Produces the three-byte form `1110xxxx 10xxxxxx 10xxxxxx`. Panics if
	/// `surrogate` is outside `0xD800..=0xDFFF`.
	fn enc_surrogate(surrogate: u16) -> [u8; 3] {
	    assert!(0xD800 <= surrogate && surrogate <= 0xDFFF);
	    // Top four bits go in the lead byte, then two groups of six bits.
	    let lead = 0b1110_0000 | (surrogate >> 12) as u8;
	    let middle = TAG_CONT_U8 | ((surrogate >> 6) & 0x3F) as u8;
	    let last = TAG_CONT_U8 | (surrogate & 0x3F) as u8;
	    [lead, middle, last]
	}