| /* Copyright 2013 Google Inc. All Rights Reserved. |
| |
| Distributed under MIT license. |
| See file LICENSE for detail or copy at https://opensource.org/licenses/MIT |
| */ |
| |
| /* Heuristics for deciding about the UTF8-ness of strings. */ |
| |
| #include "./utf8_util.h" |
| |
| #include "../common/types.h" |
| |
| #if defined(__cplusplus) || defined(c_plusplus) |
| extern "C" { |
| #endif |
| |
/* Decodes a single symbol from the start of |input|.

   |input| must have at least one readable byte (input[0] is read
   unconditionally); |size| bounds how many bytes may be examined.

   On a well-formed 1..4 byte sequence, stores the decoded code point in
   |*symbol| and returns the sequence length. Overlong encodings, values above
   0x10ffff, truncated sequences and the NUL byte are not accepted.

   Otherwise stores 0x110000 | input[0] in |*symbol| (a value above the code
   point range, so callers can tell it apart) and returns 1.

   NOTE: the 3-byte form only checks for overlong encodings, so surrogate
   values (0xd800..0xdfff) are accepted as-is. */
static size_t BrotliParseAsUTF8(
    int* symbol, const uint8_t* input, size_t size) {
  const uint8_t lead = input[0];
  int code;

  if (lead < 0x80) {
    /* 1 byte: 0xxxxxxx. A zero byte is deliberately left to the fallback. */
    if (lead != 0) {
      *symbol = lead;
      return 1;
    }
  } else if ((lead & 0xe0) == 0xc0) {
    /* 2 bytes: 110xxxxx 10xxxxxx. */
    if (size >= 2 &&
        (input[1] & 0xc0) == 0x80) {
      code = ((lead & 0x1f) << 6) |
             (input[1] & 0x3f);
      /* Values up to 0x7f here would be overlong encodings. */
      if (code >= 0x80) {
        *symbol = code;
        return 2;
      }
    }
  } else if ((lead & 0xf0) == 0xe0) {
    /* 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx. */
    if (size >= 3 &&
        (input[1] & 0xc0) == 0x80 &&
        (input[2] & 0xc0) == 0x80) {
      code = ((lead & 0x0f) << 12) |
             ((input[1] & 0x3f) << 6) |
             (input[2] & 0x3f);
      /* Values up to 0x7ff here would be overlong encodings. */
      if (code >= 0x800) {
        *symbol = code;
        return 3;
      }
    }
  } else if ((lead & 0xf8) == 0xf0) {
    /* 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx. */
    if (size >= 4 &&
        (input[1] & 0xc0) == 0x80 &&
        (input[2] & 0xc0) == 0x80 &&
        (input[3] & 0xc0) == 0x80) {
      code = ((lead & 0x07) << 18) |
             ((input[1] & 0x3f) << 12) |
             ((input[2] & 0x3f) << 6) |
             (input[3] & 0x3f);
      /* Reject overlong encodings and anything past the last code point. */
      if (code >= 0x10000 && code <= 0x10ffff) {
        *symbol = code;
        return 4;
      }
    }
  }

  /* Not UTF-8: emit a special symbol above the code point space. */
  *symbol = 0x110000 | lead;
  return 1;
}
| |
| /* Returns 1 if at least min_fraction of the data is UTF8-encoded.*/ |
| BROTLI_BOOL BrotliIsMostlyUTF8( |
| const uint8_t* data, const size_t pos, const size_t mask, |
| const size_t length, const double min_fraction) { |
| size_t size_utf8 = 0; |
| size_t i = 0; |
| while (i < length) { |
| int symbol; |
| size_t bytes_read = |
| BrotliParseAsUTF8(&symbol, &data[(pos + i) & mask], length - i); |
| i += bytes_read; |
| if (symbol < 0x110000) size_utf8 += bytes_read; |
| } |
| return TO_BROTLI_BOOL(size_utf8 > min_fraction * (double)length); |
| } |
| |
| #if defined(__cplusplus) || defined(c_plusplus) |
| } /* extern "C" */ |
| #endif |