Hal Canary | f107a2f | 2018-07-25 16:52:48 -0400 | [diff] [blame] | 1 | // Copyright 2018 Google LLC. |
| 2 | // Use of this source code is governed by a BSD-style license that can be found in the LICENSE file. |
| 3 | |
Chris Dalton | 597c33a | 2020-09-18 13:27:35 -0600 | [diff] [blame] | 4 | #include "include/private/SkTFitsIn.h" |
Mike Klein | c0bd9f9 | 2019-04-23 12:05:21 -0500 | [diff] [blame] | 5 | #include "src/utils/SkUTF.h" |
Hal Canary | f107a2f | 2018-07-25 16:52:48 -0400 | [diff] [blame] | 6 | |
| 7 | #include <climits> |
| 8 | |
/** Shifts `value` left by `shift` bits.
    The shift itself is carried out on the unsigned representation and the result is
    converted back to int32_t, so no signed left shift is ever evaluated.
*/
static constexpr inline int32_t left_shift(int32_t value, int32_t shift) {
    return static_cast<int32_t>(static_cast<uint32_t>(value) << shift);
}
| 12 | |
/** @returns true iff the lowest bit of x is clear, i.e. x is a multiple of 2. */
template <typename T> static constexpr bool is_align2(T x) {
    return !(x & 1);
}
| 14 | |
/** @returns true iff the two lowest bits of x are clear, i.e. x is a multiple of 4. */
template <typename T> static constexpr bool is_align4(T x) {
    return !(x & 3);
}
| 16 | |
/** @returns true iff c lies in 0xD800..0xDBFF (the first code unit of a surrogate pair). */
static constexpr inline bool utf16_is_high_surrogate(uint16_t c) {
    return c >= 0xD800 && c <= 0xDBFF;
}
| 18 | |
/** @returns true iff c lies in 0xDC00..0xDFFF (the second code unit of a surrogate pair). */
static constexpr inline bool utf16_is_low_surrogate(uint16_t c) {
    return c >= 0xDC00 && c <= 0xDFFF;
}
| 20 | |
/** Classifies a single UTF-8 byte.

    @returns -1 for a byte that never appears in UTF-8 (0xC0, 0xC1, 0xF5..0xFF),
              0 for a continuation byte (0x80..0xBF),
              1 for an ASCII byte (0x00..0x7F),
              2 for the leading byte of a 2-byte sequence (0xC2..0xDF),
              3 for the leading byte of a 3-byte sequence (0xE0..0xEF), and
              4 for the leading byte of a 4-byte sequence (0xF0..0xF4).
    A positive return value is therefore the length of the sequence the byte starts.
*/
static int utf8_byte_type(uint8_t c) {
    if (c < 0x80) {
        return 1;
    }
    if (c < 0xC0) {
        return 0;
    }
    // "octet values c0, c1, f5 to ff never appear"
    if (c == 0xC0 || c == 0xC1 || c >= 0xF5) {
        return -1;
    }
    // Only 0xC2..0xF4 remain; the high nibble alone decides the sequence length.
    if (c < 0xE0) {
        return 2;
    }
    if (c < 0xF0) {
        return 3;
    }
    return 4;
}
/** @returns true iff `type` is a positive byte-type code, i.e. a sequence length of at
    least 1 rather than the 0 / -1 codes.
*/
static bool utf8_type_is_valid_leading_byte(int type) {
    return type >= 1;
}
| 43 | |
/** @returns true iff c has the bit pattern 10xxxxxx (0x80..0xBF), the form of a UTF-8
    continuation byte.
*/
static bool utf8_byte_is_continuation(uint8_t c) {
    return (c & 0xC0) == 0x80;
}
| 45 | |
| 46 | //////////////////////////////////////////////////////////////////////////////// |
| 47 | |
| 48 | int SkUTF::CountUTF8(const char* utf8, size_t byteLength) { |
| 49 | if (!utf8) { |
| 50 | return -1; |
| 51 | } |
| 52 | int count = 0; |
| 53 | const char* stop = utf8 + byteLength; |
| 54 | while (utf8 < stop) { |
| 55 | int type = utf8_byte_type(*(const uint8_t*)utf8); |
| 56 | if (!utf8_type_is_valid_leading_byte(type) || utf8 + type > stop) { |
| 57 | return -1; // Sequence extends beyond end. |
| 58 | } |
| 59 | while(type-- > 1) { |
| 60 | ++utf8; |
| 61 | if (!utf8_byte_is_continuation(*(const uint8_t*)utf8)) { |
| 62 | return -1; |
| 63 | } |
| 64 | } |
| 65 | ++utf8; |
| 66 | ++count; |
| 67 | } |
| 68 | return count; |
| 69 | } |
| 70 | |
| 71 | int SkUTF::CountUTF16(const uint16_t* utf16, size_t byteLength) { |
| 72 | if (!utf16 || !is_align2(intptr_t(utf16)) || !is_align2(byteLength)) { |
| 73 | return -1; |
| 74 | } |
| 75 | const uint16_t* src = (const uint16_t*)utf16; |
| 76 | const uint16_t* stop = src + (byteLength >> 1); |
| 77 | int count = 0; |
| 78 | while (src < stop) { |
| 79 | unsigned c = *src++; |
| 80 | if (utf16_is_low_surrogate(c)) { |
| 81 | return -1; |
| 82 | } |
| 83 | if (utf16_is_high_surrogate(c)) { |
| 84 | if (src >= stop) { |
| 85 | return -1; |
| 86 | } |
| 87 | c = *src++; |
| 88 | if (!utf16_is_low_surrogate(c)) { |
| 89 | return -1; |
| 90 | } |
| 91 | } |
| 92 | count += 1; |
| 93 | } |
| 94 | return count; |
| 95 | } |
| 96 | |
| 97 | int SkUTF::CountUTF32(const int32_t* utf32, size_t byteLength) { |
Chris Dalton | 597c33a | 2020-09-18 13:27:35 -0600 | [diff] [blame] | 98 | if (!is_align4(intptr_t(utf32)) || !is_align4(byteLength) || !SkTFitsIn<int>(byteLength >> 2)) { |
Hal Canary | f107a2f | 2018-07-25 16:52:48 -0400 | [diff] [blame] | 99 | return -1; |
| 100 | } |
| 101 | const uint32_t kInvalidUnicharMask = 0xFF000000; // unichar fits in 24 bits |
| 102 | const uint32_t* ptr = (const uint32_t*)utf32; |
| 103 | const uint32_t* stop = ptr + (byteLength >> 2); |
| 104 | while (ptr < stop) { |
| 105 | if (*ptr & kInvalidUnicharMask) { |
| 106 | return -1; |
| 107 | } |
| 108 | ptr += 1; |
| 109 | } |
| 110 | return (int)(byteLength >> 2); |
| 111 | } |
| 112 | |
| 113 | template <typename T> |
| 114 | static SkUnichar next_fail(const T** ptr, const T* end) { |
| 115 | *ptr = end; |
| 116 | return -1; |
| 117 | } |
| 118 | |
| 119 | SkUnichar SkUTF::NextUTF8(const char** ptr, const char* end) { |
| 120 | if (!ptr || !end ) { |
| 121 | return -1; |
| 122 | } |
| 123 | const uint8_t* p = (const uint8_t*)*ptr; |
| 124 | if (!p || p >= (const uint8_t*)end) { |
| 125 | return next_fail(ptr, end); |
| 126 | } |
| 127 | int c = *p; |
| 128 | int hic = c << 24; |
| 129 | |
| 130 | if (!utf8_type_is_valid_leading_byte(utf8_byte_type(c))) { |
| 131 | return next_fail(ptr, end); |
| 132 | } |
| 133 | if (hic < 0) { |
| 134 | uint32_t mask = (uint32_t)~0x3F; |
| 135 | hic = left_shift(hic, 1); |
| 136 | do { |
| 137 | ++p; |
| 138 | if (p >= (const uint8_t*)end) { |
| 139 | return next_fail(ptr, end); |
| 140 | } |
| 141 | // check before reading off end of array. |
| 142 | uint8_t nextByte = *p; |
| 143 | if (!utf8_byte_is_continuation(nextByte)) { |
| 144 | return next_fail(ptr, end); |
| 145 | } |
| 146 | c = (c << 6) | (nextByte & 0x3F); |
| 147 | mask <<= 5; |
| 148 | } while ((hic = left_shift(hic, 1)) < 0); |
| 149 | c &= ~mask; |
| 150 | } |
| 151 | *ptr = (char*)p + 1; |
| 152 | return c; |
| 153 | } |
| 154 | |
| 155 | SkUnichar SkUTF::NextUTF16(const uint16_t** ptr, const uint16_t* end) { |
| 156 | if (!ptr || !end ) { |
Greg Kaiser | be03a851 | 2019-02-11 11:20:39 -0800 | [diff] [blame] | 157 | return -1; |
Hal Canary | f107a2f | 2018-07-25 16:52:48 -0400 | [diff] [blame] | 158 | } |
| 159 | const uint16_t* src = *ptr; |
| 160 | if (!src || src + 1 > end || !is_align2(intptr_t(src))) { |
| 161 | return next_fail(ptr, end); |
| 162 | } |
| 163 | uint16_t c = *src++; |
| 164 | SkUnichar result = c; |
| 165 | if (utf16_is_low_surrogate(c)) { |
| 166 | return next_fail(ptr, end); // srcPtr should never point at low surrogate. |
| 167 | } |
| 168 | if (utf16_is_high_surrogate(c)) { |
| 169 | if (src + 1 > end) { |
| 170 | return next_fail(ptr, end); // Truncated string. |
| 171 | } |
| 172 | uint16_t low = *src++; |
| 173 | if (!utf16_is_low_surrogate(low)) { |
| 174 | return next_fail(ptr, end); |
| 175 | } |
| 176 | /* |
| 177 | [paraphrased from wikipedia] |
| 178 | Take the high surrogate and subtract 0xD800, then multiply by 0x400. |
| 179 | Take the low surrogate and subtract 0xDC00. Add these two results |
| 180 | together, and finally add 0x10000 to get the final decoded codepoint. |
| 181 | |
| 182 | unicode = (high - 0xD800) * 0x400 + low - 0xDC00 + 0x10000 |
| 183 | unicode = (high * 0x400) - (0xD800 * 0x400) + low - 0xDC00 + 0x10000 |
| 184 | unicode = (high << 10) - (0xD800 << 10) + low - 0xDC00 + 0x10000 |
| 185 | unicode = (high << 10) + low - ((0xD800 << 10) + 0xDC00 - 0x10000) |
| 186 | */ |
| 187 | result = (result << 10) + (SkUnichar)low - ((0xD800 << 10) + 0xDC00 - 0x10000); |
| 188 | } |
| 189 | *ptr = src; |
| 190 | return result; |
| 191 | } |
| 192 | |
| 193 | SkUnichar SkUTF::NextUTF32(const int32_t** ptr, const int32_t* end) { |
| 194 | if (!ptr || !end ) { |
| 195 | return -1; |
| 196 | } |
| 197 | const int32_t* s = *ptr; |
| 198 | if (!s || s + 1 > end || !is_align4(intptr_t(s))) { |
| 199 | return next_fail(ptr, end); |
| 200 | } |
| 201 | int32_t value = *s; |
| 202 | const uint32_t kInvalidUnicharMask = 0xFF000000; // unichar fits in 24 bits |
| 203 | if (value & kInvalidUnicharMask) { |
| 204 | return next_fail(ptr, end); |
| 205 | } |
| 206 | *ptr = s + 1; |
| 207 | return value; |
| 208 | } |
| 209 | |
| 210 | size_t SkUTF::ToUTF8(SkUnichar uni, char utf8[SkUTF::kMaxBytesInUTF8Sequence]) { |
| 211 | if ((uint32_t)uni > 0x10FFFF) { |
| 212 | return 0; |
| 213 | } |
| 214 | if (uni <= 127) { |
| 215 | if (utf8) { |
| 216 | *utf8 = (char)uni; |
| 217 | } |
| 218 | return 1; |
| 219 | } |
| 220 | char tmp[4]; |
| 221 | char* p = tmp; |
| 222 | size_t count = 1; |
| 223 | while (uni > 0x7F >> count) { |
| 224 | *p++ = (char)(0x80 | (uni & 0x3F)); |
| 225 | uni >>= 6; |
| 226 | count += 1; |
| 227 | } |
| 228 | if (utf8) { |
| 229 | p = tmp; |
| 230 | utf8 += count; |
| 231 | while (p < tmp + count - 1) { |
| 232 | *--utf8 = *p++; |
| 233 | } |
| 234 | *--utf8 = (char)(~(0xFF >> count) | uni); |
| 235 | } |
| 236 | return count; |
| 237 | } |
| 238 | |
| 239 | size_t SkUTF::ToUTF16(SkUnichar uni, uint16_t utf16[2]) { |
| 240 | if ((uint32_t)uni > 0x10FFFF) { |
| 241 | return 0; |
| 242 | } |
| 243 | int extra = (uni > 0xFFFF); |
| 244 | if (utf16) { |
| 245 | if (extra) { |
| 246 | utf16[0] = (uint16_t)((0xD800 - 64) + (uni >> 10)); |
| 247 | utf16[1] = (uint16_t)(0xDC00 | (uni & 0x3FF)); |
| 248 | } else { |
| 249 | utf16[0] = (uint16_t)uni; |
| 250 | } |
| 251 | } |
| 252 | return 1 + extra; |
| 253 | } |
| 254 | |
Julia Lavrova | 90787fe | 2020-07-20 17:32:03 +0000 | [diff] [blame] | 255 | int SkUTF::UTF8ToUTF16(uint16_t dst[], int dstCapacity, const char src[], size_t srcByteLength) { |
| 256 | if (!dst) { |
| 257 | dstCapacity = 0; |
| 258 | } |
| 259 | |
| 260 | int dstLength = 0; |
| 261 | uint16_t* endDst = dst + dstCapacity; |
| 262 | const char* endSrc = src + srcByteLength; |
| 263 | while (src < endSrc) { |
| 264 | SkUnichar uni = NextUTF8(&src, endSrc); |
| 265 | if (uni < 0) { |
| 266 | return -1; |
| 267 | } |
| 268 | |
| 269 | uint16_t utf16[2]; |
| 270 | size_t count = ToUTF16(uni, utf16); |
| 271 | if (count == 0) { |
| 272 | return -1; |
| 273 | } |
| 274 | dstLength += count; |
| 275 | |
| 276 | if (dst) { |
| 277 | uint16_t* elems = utf16; |
| 278 | while (dst < endDst && count > 0) { |
| 279 | *dst++ = *elems++; |
| 280 | count -= 1; |
| 281 | } |
| 282 | } |
| 283 | } |
| 284 | return dstLength; |
| 285 | } |
| 286 | |
Julia Lavrova | b6b7fff | 2020-09-11 13:59:49 +0000 | [diff] [blame] | 287 | int SkUTF::UTF16ToUTF8(char dst[], int dstCapacity, const uint16_t src[], size_t srcLength) { |
Julia Lavrova | b6b7fff | 2020-09-11 13:59:49 +0000 | [diff] [blame] | 288 | if (!dst) { |
| 289 | dstCapacity = 0; |
| 290 | } |
| 291 | |
| 292 | int dstLength = 0; |
| 293 | const char* endDst = dst + dstCapacity; |
Jason Simmons | 3b88c07 | 2020-09-23 16:00:46 -0700 | [diff] [blame] | 294 | const uint16_t* endSrc = src + srcLength; |
| 295 | while (src < endSrc) { |
| 296 | SkUnichar uni = NextUTF16(&src, endSrc); |
| 297 | if (uni < 0) { |
| 298 | return -1; |
| 299 | } |
| 300 | |
Julia Lavrova | b6b7fff | 2020-09-11 13:59:49 +0000 | [diff] [blame] | 301 | char utf8[SkUTF::kMaxBytesInUTF8Sequence]; |
Jason Simmons | 3b88c07 | 2020-09-23 16:00:46 -0700 | [diff] [blame] | 302 | size_t count = ToUTF8(uni, utf8); |
Julia Lavrova | b6b7fff | 2020-09-11 13:59:49 +0000 | [diff] [blame] | 303 | if (count == 0) { |
| 304 | return -1; |
| 305 | } |
| 306 | dstLength += count; |
Jason Simmons | 3b88c07 | 2020-09-23 16:00:46 -0700 | [diff] [blame] | 307 | |
Julia Lavrova | b6b7fff | 2020-09-11 13:59:49 +0000 | [diff] [blame] | 308 | if (dst) { |
| 309 | const char* elems = utf8; |
| 310 | while (dst < endDst && count > 0) { |
| 311 | *dst++ = *elems++; |
| 312 | count -= 1; |
| 313 | } |
| 314 | } |
| 315 | } |
| 316 | return dstLength; |
| 317 | } |