// Copyright 2012-2014 The Rust Project Developers and Eric Kidd. See the
// COPYRIGHT-RUST.txt file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed except
// according to those terms.
| 9 | |
| 10 | |
//! A simple library implementing the [CESU-8 compatibility encoding
//! scheme](http://www.unicode.org/reports/tr26/tr26-2.html). This is a
//! non-standard variant of UTF-8 that is used internally by some systems
//! that need to represent UTF-16 data as 8-bit characters. Yes, this is
//! ugly.
//!
//! Use of this encoding is discouraged by the Unicode Consortium. It's OK
//! for working with existing internal APIs, but it should not be used for
//! transmitting or storing data.
//!
//! ```
//! use std::borrow::Cow;
//! use cesu8::{from_cesu8, to_cesu8};
//!
//! // 16-bit Unicode characters are the same in UTF-8 and CESU-8.
//! assert_eq!(Cow::Borrowed("aé日".as_bytes()),
//!            to_cesu8("aé日"));
//! assert_eq!(Cow::Borrowed("aé日"),
//!            from_cesu8("aé日".as_bytes()).unwrap());
//!
//! // This string is CESU-8 data containing a 6-byte surrogate pair,
//! // which decodes to a 4-byte UTF-8 string.
//! let data = &[0xED, 0xA0, 0x81, 0xED, 0xB0, 0x81];
//! assert_eq!(Cow::Borrowed("\u{10401}"),
//!            from_cesu8(data).unwrap());
//! ```
//!
//! ### A note about security
//!
//! As a general rule, this library is intended to fail on malformed or
//! unexpected input. CESU-8 is supposed to be an internal-only format,
//! and if we're seeing malformed data, we assume that it's either a bug in
//! somebody's code, or an attacker is trying to improperly encode data to
//! evade security checks.
//!
//! If you have a use case for lossy conversion to UTF-8, or conversion
//! from mixed UTF-8/CESU-8 data, please feel free to submit a pull request
//! for `from_cesu8_lossy_permissive` with appropriate behavior.
//!
//! ### Java and U+0000, and other variants
//!
//! Java uses the CESU-8 encoding as described above, but with one
//! difference: The null character U+0000 is represented as an overlong
//! UTF-8 sequence `C0 80`. This is supported by the `from_java_cesu8` and
//! `to_java_cesu8` functions.
//!
//! ### Surrogate pairs and UTF-8
//!
//! The UTF-16 encoding uses "surrogate pairs" to represent Unicode code
//! points in the range from U+10000 to U+10FFFF. These are 16-bit numbers
//! in the range 0xD800 to 0xDFFF.
//!
//! * 0xD800 to 0xDBFF: First half of surrogate pair. When encoded as
//!   CESU-8, these become **1110**1101 **10**100000 **10**000000 to
//!   **1110**1101 **10**101111 **10**111111.
//!
//! * 0xDC00 to 0xDFFF: Second half of surrogate pair. These become
//!   **1110**1101 **10**110000 **10**000000 to
//!   **1110**1101 **10**111111 **10**111111.
//!
//! Wikipedia [explains](http://en.wikipedia.org/wiki/UTF-16) the
//! code point to UTF-16 conversion process:
//!
//! > Consider the encoding of U+10437 (𐐷):
//! >
//! > * Subtract 0x10000 from 0x10437. The result is 0x00437, 0000 0000 0100
//! >   0011 0111.
//! > * Split this into the high 10-bit value and the low 10-bit value:
//! >   0000000001 and 0000110111.
//! > * Add 0xD800 to the high value to form the high surrogate: 0xD800 +
//! >   0x0001 = 0xD801.
//! > * Add 0xDC00 to the low value to form the low surrogate: 0xDC00 +
//! >   0x0037 = 0xDC37.
| 84 | |
| 85 | #![warn(missing_docs)] |
| 86 | |
| 87 | |
| 88 | use std::borrow::Cow; |
| 89 | use std::error::Error; |
| 90 | use std::fmt; |
| 91 | use std::result::Result; |
| 92 | use std::slice; |
| 93 | use std::str::{from_utf8, from_utf8_unchecked}; |
| 94 | use unicode::utf8_char_width; |
| 95 | |
| 96 | mod unicode; |
| 97 | |
/// Mask selecting the six value bits of a continuation byte (`10xxxxxx`).
const CONT_MASK: u8 = 0b0011_1111u8;
/// Value of the tag bits of a continuation byte. The tag mask is
/// `!CONT_MASK`, so a byte `b` is a continuation byte exactly when
/// `b & !CONT_MASK == TAG_CONT_U8`.
const TAG_CONT_U8: u8 = 0b1000_0000u8;
| 102 | |
/// The CESU-8 data could not be decoded as valid UTF-8 data.
///
/// This is a unit struct: it carries no information about where in the
/// input the failure occurred.
#[derive(Clone, Copy, Debug)]
pub struct Cesu8DecodingError;

impl Error for Cesu8DecodingError {
    // `description` is deprecated in favour of `Display`, but it is kept so
    // that existing callers continue to see the same short message.
    #[allow(deprecated)]
    fn description(&self) -> &str {
        "decoding error"
    }

    // No `cause`/`source` override: this error has no underlying cause, and
    // the trait's default implementations already report `None`.
}

impl fmt::Display for Cesu8DecodingError {
    /// Write the human-readable error message.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        f.write_str("could not convert CESU-8 data to UTF-8")
    }
}
| 117 | |
/// Which variant of the encoding are we working with?
#[derive(PartialEq, Eq)]
enum Variant {
    /// Regular CESU-8, with '\0' represented by itself.
    Standard,
    /// This is technically Java's "Modified UTF-8", which is like CESU-8
    /// except that '\0' is written as the overlong two-byte sequence
    /// `C0 80` instead of a single zero byte. I'm sure it seemed like a
    /// good idea at the time.
    Java,
}
| 128 | |
/// Convert CESU-8 data to a Rust string, re-encoding only if necessary.
///
/// Input that is already valid UTF-8 is returned borrowed, without
/// allocating; anything else is decoded into a newly allocated `String`.
///
/// # Errors
///
/// Returns [`Cesu8DecodingError`] if the data is not valid UTF-8 and also
/// cannot be decoded as CESU-8.
///
/// # Examples
///
/// ```
/// use std::borrow::Cow;
/// use cesu8::from_cesu8;
///
/// // This string is valid as UTF-8 or CESU-8, so it doesn't change,
/// // and we can convert it without allocating memory.
/// assert_eq!(Cow::Borrowed("aé日"),
///            from_cesu8("aé日".as_bytes()).unwrap());
///
/// // This string is CESU-8 data containing a 6-byte surrogate pair,
/// // which becomes a 4-byte UTF-8 string.
/// let data = &[0xED, 0xA0, 0x81, 0xED, 0xB0, 0x81];
/// assert_eq!(Cow::Borrowed("\u{10401}"),
///            from_cesu8(data).unwrap());
/// ```
pub fn from_cesu8(bytes: &[u8]) -> Result<Cow<str>, Cesu8DecodingError> {
    from_cesu8_internal(bytes, Variant::Standard)
}
| 150 | |
/// Convert Java's modified UTF-8 data to a Rust string, re-encoding only if
/// necessary.
///
/// Input that is already valid UTF-8 is returned borrowed, without
/// allocating; anything else is decoded into a newly allocated `String`,
/// with each `C0 80` pair decoded to U+0000.
///
/// # Errors
///
/// Returns [`Cesu8DecodingError`] if the data is not valid UTF-8 and also
/// cannot be decoded as modified UTF-8.
///
/// # Examples
///
/// ```
/// use std::borrow::Cow;
/// use cesu8::from_java_cesu8;
///
/// // This string is valid as UTF-8 or modified UTF-8, so it doesn't change,
/// // and we can convert it without allocating memory.
/// assert_eq!(Cow::Borrowed("aé日"),
///            from_java_cesu8("aé日".as_bytes()).unwrap());
///
/// // This string is modified UTF-8 data containing a 6-byte surrogate pair,
/// // which becomes a 4-byte UTF-8 string.
/// let data = &[0xED, 0xA0, 0x81, 0xED, 0xB0, 0x81];
/// assert_eq!(Cow::Borrowed("\u{10401}"),
///            from_java_cesu8(data).unwrap());
///
/// // This string is modified UTF-8 data containing null code-points.
/// let data = &[0xC0, 0x80, 0xC0, 0x80];
/// assert_eq!(Cow::Borrowed("\0\0"),
///            from_java_cesu8(data).unwrap());
/// ```
pub fn from_java_cesu8(bytes: &[u8]) -> Result<Cow<str>, Cesu8DecodingError> {
    from_cesu8_internal(bytes, Variant::Java)
}
| 178 | |
| 179 | /// Do the actual work of decoding. |
| 180 | fn from_cesu8_internal(bytes: &[u8], variant: Variant) -> |
| 181 | Result<Cow<str>, Cesu8DecodingError> |
| 182 | { |
| 183 | match from_utf8(bytes) { |
| 184 | Ok(str) => Ok(Cow::Borrowed(str)), |
| 185 | _ => { |
| 186 | let mut decoded = Vec::with_capacity(bytes.len()); |
| 187 | if decode_from_iter(&mut decoded, &mut bytes.iter(), variant) { |
| 188 | // Keep this assertion in debug mode only. It's important |
| 189 | // that this assertion is true, because Rust assumes that |
| 190 | // all UTF-8 strings are valid. |
| 191 | debug_assert!(from_utf8(&decoded[..]).is_ok()); |
| 192 | Ok(Cow::Owned(unsafe { String::from_utf8_unchecked(decoded) })) |
| 193 | } else { |
| 194 | Err(Cesu8DecodingError) |
| 195 | } |
| 196 | } |
| 197 | } |
| 198 | } |
| 199 | |
#[test]
fn test_from_cesu8() {
    // The surrogate-encoded character below is from the ICU library's
    // icu/source/test/testdata/conversion.txt test case.
    let cesu8_input: [u8; 11] = [
        0x4D,
        0xE6, 0x97, 0xA5,
        0xED, 0xA0, 0x81, 0xED, 0xB0, 0x81,
        0x7F,
    ];
    let decoded = from_cesu8(&cesu8_input).unwrap();
    assert_eq!(Cow::Borrowed("M日\u{10401}\u{7F}"), decoded);

    // We used to have test data from the CESU-8 specification, but when we
    // worked it through manually, we got a different answer from the one
    // the specification gives:
    //
    // Input: [0xED, 0xAE, 0x80, 0xED, 0xB0, 0x80]
    // Binary: 11101101 10101110 10000000 11101101 10110000 10000000
    //
    // 0b1101_101110_000000 -> 0xDB80
    // 0b1101_110000_000000 -> 0xDC00
    //
    // ((0xDB80 - 0xD800) << 10) | (0xDC00 - 0xDC00) -> 0xE0000
    // 0x10000 + 0xE0000 -> 0xF0000
    //
    // The spec claims that we are supposed to get 0x10000, not 0xF0000.
    // Since I can't reconcile this example data with the text of the
    // specification, I decided to use a test character from ICU instead.
}
| 224 | |
| 225 | // Our internal decoder, based on Rust's is_utf8 implementation. |
| 226 | fn decode_from_iter( |
| 227 | decoded: &mut Vec<u8>, iter: &mut slice::Iter<u8>, variant: Variant) |
| 228 | -> bool |
| 229 | { |
| 230 | macro_rules! err { |
| 231 | () => { return false } |
| 232 | } |
| 233 | macro_rules! next { |
| 234 | () => { |
| 235 | match iter.next() { |
| 236 | Some(a) => *a, |
| 237 | // We needed data, but there was none: error! |
| 238 | None => err!() |
| 239 | } |
| 240 | } |
| 241 | } |
| 242 | macro_rules! next_cont { |
| 243 | () => { |
| 244 | { |
| 245 | let byte = next!(); |
| 246 | if (byte) & !CONT_MASK == TAG_CONT_U8 { byte } else { err!() } |
| 247 | } |
| 248 | } |
| 249 | } |
| 250 | |
| 251 | loop { |
| 252 | let first = match iter.next() { |
| 253 | Some(&b) => b, |
| 254 | // We're at the end of the iterator and a codepoint boundary at |
| 255 | // the same time, so this string is valid. |
| 256 | None => return true |
| 257 | }; |
| 258 | |
| 259 | if variant == Variant::Java && first == 0 { |
| 260 | // Java's modified UTF-8 should never contain \0 directly. |
| 261 | err!(); |
| 262 | } else if first < 128 { |
| 263 | // Pass ASCII through directly. |
| 264 | decoded.push(first); |
| 265 | } else if first == 0xc0 && variant == Variant::Java { |
| 266 | match next!() { |
| 267 | 0x80 => decoded.push(0), |
| 268 | _ => err!(), |
| 269 | } |
| 270 | } else { |
| 271 | let w = utf8_char_width(first); |
| 272 | let second = next_cont!(); |
| 273 | match w { |
| 274 | // Two-byte sequences can be used directly. |
| 275 | 2 => { decoded.extend([first, second].iter().cloned()); } |
| 276 | 3 => { |
| 277 | let third = next_cont!(); |
| 278 | match (first, second) { |
| 279 | // These are valid UTF-8, so pass them through. |
| 280 | (0xE0 , 0xA0 ... 0xBF) | |
| 281 | (0xE1 ... 0xEC, 0x80 ... 0xBF) | |
| 282 | (0xED , 0x80 ... 0x9F) | |
| 283 | (0xEE ... 0xEF, 0x80 ... 0xBF) => { |
| 284 | decoded.extend([first, second, third].iter() |
| 285 | .cloned()) |
| 286 | } |
| 287 | // First half a surrogate pair, so decode. |
| 288 | (0xED , 0xA0 ... 0xAF) => { |
| 289 | if next!() != 0xED { err!() } |
| 290 | let fifth = next_cont!(); |
| 291 | if fifth < 0xB0 || 0xBF < fifth { err!() } |
| 292 | let sixth = next_cont!(); |
| 293 | let s = dec_surrogates(second, third, fifth, sixth); |
| 294 | decoded.extend(s.iter().cloned()); |
| 295 | } |
| 296 | _ => err!() |
| 297 | } |
| 298 | } |
| 299 | _ => err!() |
| 300 | } |
| 301 | } |
| 302 | } |
| 303 | } |
| 304 | |
| 305 | /// Convert the two trailing bytes from a CESU-8 surrogate to a regular |
| 306 | /// surrogate value. |
| 307 | fn dec_surrogate(second: u8, third: u8) -> u32 { |
| 308 | 0xD000u32 | ((second & CONT_MASK) as u32) << 6 | (third & CONT_MASK) as u32 |
| 309 | } |
| 310 | |
| 311 | /// Convert the bytes from a CESU-8 surrogate pair into a valid UTF-8 |
| 312 | /// sequence. Assumes input is valid. |
| 313 | fn dec_surrogates(second: u8, third: u8, fifth: u8, sixth: u8) -> [u8; 4] { |
| 314 | // Convert to a 32-bit code point. |
| 315 | let s1 = dec_surrogate(second, third); |
| 316 | let s2 = dec_surrogate(fifth, sixth); |
| 317 | let c = 0x10000 + (((s1 - 0xD800) << 10) | (s2 - 0xDC00)); |
| 318 | //println!("{:0>8b} {:0>8b} {:0>8b} -> {:0>16b}", 0xEDu8, second, third, s1); |
| 319 | //println!("{:0>8b} {:0>8b} {:0>8b} -> {:0>16b}", 0xEDu8, fifth, sixth, s2); |
| 320 | //println!("-> {:0>32b}", c); |
| 321 | assert!(0x010000 <= c && c <= 0x10FFFF); |
| 322 | |
| 323 | // Convert to UTF-8. |
| 324 | // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx |
| 325 | [0b1111_0000u8 | ((c & 0b1_1100_0000_0000_0000_0000) >> 18) as u8, |
| 326 | TAG_CONT_U8 | ((c & 0b0_0011_1111_0000_0000_0000) >> 12) as u8, |
| 327 | TAG_CONT_U8 | ((c & 0b0_0000_0000_1111_1100_0000) >> 6) as u8, |
| 328 | TAG_CONT_U8 | ((c & 0b0_0000_0000_0000_0011_1111) ) as u8] |
| 329 | } |
| 330 | |
| 331 | /// Convert a Rust `&str` to CESU-8 bytes. |
| 332 | /// |
| 333 | /// ``` |
| 334 | /// use std::borrow::Cow; |
| 335 | /// use cesu8::to_cesu8; |
| 336 | /// |
| 337 | /// // This string is valid as UTF-8 or CESU-8, so it doesn't change, |
| 338 | /// // and we can convert it without allocating memory. |
| 339 | /// assert_eq!(Cow::Borrowed("aé日".as_bytes()), to_cesu8("aé日")); |
| 340 | /// |
| 341 | /// // This string is a 4-byte UTF-8 string, which becomes a 6-byte CESU-8 |
| 342 | /// // vector. |
| 343 | /// assert_eq!(Cow::Borrowed(&[0xED, 0xA0, 0x81, 0xED, 0xB0, 0x81]), |
| 344 | /// to_cesu8("\u{10401}")); |
| 345 | /// ``` |
| 346 | pub fn to_cesu8(text: &str) -> Cow<[u8]> { |
| 347 | if is_valid_cesu8(text) { |
| 348 | Cow::Borrowed(text.as_bytes()) |
| 349 | } else { |
| 350 | Cow::Owned(to_cesu8_internal(text, Variant::Standard)) |
| 351 | } |
| 352 | } |
| 353 | |
| 354 | /// Convert a Rust `&str` to Java's modified UTF-8 bytes. |
| 355 | /// |
| 356 | /// ``` |
| 357 | /// use std::borrow::Cow; |
| 358 | /// use cesu8::to_java_cesu8; |
| 359 | /// |
| 360 | /// // This string is valid as UTF-8 or CESU-8, so it doesn't change, |
| 361 | /// // and we can convert it without allocating memory. |
| 362 | /// assert_eq!(Cow::Borrowed("aé日".as_bytes()), to_java_cesu8("aé日")); |
| 363 | /// |
| 364 | /// // This string is a 4-byte UTF-8 string, which becomes a 6-byte modified |
| 365 | /// // UTF-8 vector. |
| 366 | /// assert_eq!(Cow::Borrowed(&[0xED, 0xA0, 0x81, 0xED, 0xB0, 0x81]), |
| 367 | /// to_java_cesu8("\u{10401}")); |
| 368 | /// |
| 369 | /// // This string contains null, which becomes 2-byte modified UTF-8 encoding |
| 370 | /// assert_eq!(Cow::Borrowed(&[0xC0, 0x80, 0xC0, 0x80]), |
| 371 | /// to_java_cesu8("\0\0")); |
| 372 | /// ``` |
| 373 | pub fn to_java_cesu8(text: &str) -> Cow<[u8]> { |
| 374 | if is_valid_java_cesu8(text) { |
| 375 | Cow::Borrowed(text.as_bytes()) |
| 376 | } else { |
| 377 | Cow::Owned(to_cesu8_internal(text, Variant::Java)) |
| 378 | } |
| 379 | } |
| 380 | |
| 381 | fn to_cesu8_internal(text: &str, variant: Variant) -> Vec<u8> { |
| 382 | let bytes = text.as_bytes(); |
| 383 | let mut encoded = Vec::with_capacity(bytes.len() + bytes.len() >> 2); |
| 384 | let mut i = 0; |
| 385 | while i < bytes.len() { |
| 386 | let b = bytes[i]; |
| 387 | if variant == Variant::Java && b == 0 { |
| 388 | encoded.push(0xc0); |
| 389 | encoded.push(0x80); |
| 390 | i += 1; |
| 391 | } else if b < 128 { |
| 392 | // Pass ASCII through quickly. |
| 393 | encoded.push(b); |
| 394 | i += 1; |
| 395 | } else { |
| 396 | // Figure out how many bytes we need for this character. |
| 397 | let w = utf8_char_width(b); |
| 398 | assert!(w <= 4); |
| 399 | assert!(i + w <= bytes.len()); |
| 400 | if w != 4 { |
| 401 | // Pass through short UTF-8 sequences unmodified. |
| 402 | encoded.extend(bytes[i..i+w].iter().cloned()); |
| 403 | } else { |
| 404 | // Encode 4-byte sequences as 6 bytes. |
| 405 | let s = unsafe { from_utf8_unchecked(&bytes[i..i+w]) }; |
| 406 | let c = s.chars().next().unwrap() as u32 - 0x10000; |
| 407 | let mut s: [u16; 2] = [0; 2]; |
| 408 | s[0] = ((c >> 10) as u16) | 0xD800; |
| 409 | s[1] = ((c & 0x3FF) as u16) | 0xDC00; |
| 410 | encoded.extend(enc_surrogate(s[0]).iter().cloned()); |
| 411 | encoded.extend(enc_surrogate(s[1]).iter().cloned()); |
| 412 | } |
| 413 | i += w; |
| 414 | } |
| 415 | } |
| 416 | encoded |
| 417 | } |
| 418 | |
| 419 | /// Check whether a Rust string contains valid CESU-8 data. |
| 420 | pub fn is_valid_cesu8(text: &str) -> bool { |
| 421 | // We rely on the fact that Rust strings are guaranteed to be valid |
| 422 | // UTF-8. |
| 423 | for b in text.bytes() { |
| 424 | if (b & !CONT_MASK) == TAG_CONT_U8 { continue; } |
| 425 | if utf8_char_width(b) > 3 { return false; } |
| 426 | } |
| 427 | true |
| 428 | } |
| 429 | |
| 430 | /// Check whether a Rust string contains valid Java's modified UTF-8 data. |
| 431 | pub fn is_valid_java_cesu8(text: &str) -> bool { |
| 432 | !text.contains('\0') && is_valid_cesu8(text) |
| 433 | } |
| 434 | |
#[test]
fn test_valid_cesu8() {
    // (input, valid as CESU-8, valid as Java's modified UTF-8)
    let cases = [
        ("aé日", true, true),
        ("\u{10401}", false, false),
        ("\0\0", true, false),
    ];
    for &(text, standard_ok, java_ok) in cases.iter() {
        assert_eq!(standard_ok, is_valid_cesu8(text));
        assert_eq!(java_ok, is_valid_java_cesu8(text));
    }
}
| 444 | |
| 445 | |
| 446 | /// Encode a single surrogate as CESU-8. |
| 447 | fn enc_surrogate(surrogate: u16) -> [u8; 3] { |
| 448 | assert!(0xD800 <= surrogate && surrogate <= 0xDFFF); |
| 449 | // 1110xxxx 10xxxxxx 10xxxxxx |
| 450 | [0b11100000 | ((surrogate & 0b11110000_00000000) >> 12) as u8, |
| 451 | TAG_CONT_U8 | ((surrogate & 0b00001111_11000000) >> 6) as u8, |
| 452 | TAG_CONT_U8 | ((surrogate & 0b00000000_00111111) ) as u8] |
| 453 | } |