Ben Murdoch | bb769b2 | 2010-08-11 14:56:33 +0100 | [diff] [blame] | 1 | // Copyright 2007-2010 the V8 project authors. All rights reserved. |
Ben Murdoch | b8a8cc1 | 2014-11-26 15:28:44 +0000 | [diff] [blame] | 2 | // Use of this source code is governed by a BSD-style license that can be |
| 3 | // found in the LICENSE file. |
Steve Block | a7e24c1 | 2009-10-30 11:49:00 +0000 | [diff] [blame] | 4 | |
| 5 | #ifndef V8_UNICODE_INL_H_ |
| 6 | #define V8_UNICODE_INL_H_ |
| 7 | |
Ben Murdoch | b8a8cc1 | 2014-11-26 15:28:44 +0000 | [diff] [blame] | 8 | #include "src/unicode.h" |
| 9 | #include "src/base/logging.h" |
| 10 | #include "src/utils.h" |
Steve Block | a7e24c1 | 2009-10-30 11:49:00 +0000 | [diff] [blame] | 11 | |
| 12 | namespace unibrow { |
| 13 | |
| 14 | template <class T, int s> bool Predicate<T, s>::get(uchar code_point) { |
| 15 | CacheEntry entry = entries_[code_point & kMask]; |
Emily Bernier | d0a1eb7 | 2015-03-24 16:35:39 -0400 | [diff] [blame] | 16 | if (entry.code_point() == code_point) return entry.value(); |
Steve Block | a7e24c1 | 2009-10-30 11:49:00 +0000 | [diff] [blame] | 17 | return CalculateValue(code_point); |
| 18 | } |
| 19 | |
| 20 | template <class T, int s> bool Predicate<T, s>::CalculateValue( |
| 21 | uchar code_point) { |
| 22 | bool result = T::Is(code_point); |
| 23 | entries_[code_point & kMask] = CacheEntry(code_point, result); |
| 24 | return result; |
| 25 | } |
| 26 | |
| 27 | template <class T, int s> int Mapping<T, s>::get(uchar c, uchar n, |
| 28 | uchar* result) { |
| 29 | CacheEntry entry = entries_[c & kMask]; |
| 30 | if (entry.code_point_ == c) { |
| 31 | if (entry.offset_ == 0) { |
| 32 | return 0; |
| 33 | } else { |
| 34 | result[0] = c + entry.offset_; |
| 35 | return 1; |
| 36 | } |
| 37 | } else { |
| 38 | return CalculateValue(c, n, result); |
| 39 | } |
| 40 | } |
| 41 | |
| 42 | template <class T, int s> int Mapping<T, s>::CalculateValue(uchar c, uchar n, |
| 43 | uchar* result) { |
| 44 | bool allow_caching = true; |
| 45 | int length = T::Convert(c, n, result, &allow_caching); |
| 46 | if (allow_caching) { |
| 47 | if (length == 1) { |
| 48 | entries_[c & kMask] = CacheEntry(c, result[0] - c); |
| 49 | return 1; |
| 50 | } else { |
| 51 | entries_[c & kMask] = CacheEntry(c, 0); |
| 52 | return 0; |
| 53 | } |
| 54 | } else { |
| 55 | return length; |
| 56 | } |
| 57 | } |
| 58 | |
| 59 | |
Ben Murdoch | b8a8cc1 | 2014-11-26 15:28:44 +0000 | [diff] [blame] | 60 | unsigned Utf8::EncodeOneByte(char* str, uint8_t c) { |
| 61 | static const int kMask = ~(1 << 6); |
| 62 | if (c <= kMaxOneByteChar) { |
| 63 | str[0] = c; |
| 64 | return 1; |
| 65 | } |
| 66 | str[0] = 0xC0 | (c >> 6); |
| 67 | str[1] = 0x80 | (c & kMask); |
| 68 | return 2; |
| 69 | } |
| 70 | |
| 71 | // Encode encodes the UTF-16 code units c and previous into the given str |
| 72 | // buffer, and combines surrogate code units into single code points. If |
| 73 | // replace_invalid is set to true, orphan surrogate code units will be replaced |
| 74 | // with kBadChar. |
| 75 | unsigned Utf8::Encode(char* str, |
| 76 | uchar c, |
| 77 | int previous, |
| 78 | bool replace_invalid) { |
Steve Block | a7e24c1 | 2009-10-30 11:49:00 +0000 | [diff] [blame] | 79 | static const int kMask = ~(1 << 6); |
| 80 | if (c <= kMaxOneByteChar) { |
| 81 | str[0] = c; |
| 82 | return 1; |
| 83 | } else if (c <= kMaxTwoByteChar) { |
| 84 | str[0] = 0xC0 | (c >> 6); |
| 85 | str[1] = 0x80 | (c & kMask); |
| 86 | return 2; |
| 87 | } else if (c <= kMaxThreeByteChar) { |
Ben Murdoch | b8a8cc1 | 2014-11-26 15:28:44 +0000 | [diff] [blame] | 88 | if (Utf16::IsSurrogatePair(previous, c)) { |
Ben Murdoch | 3ef787d | 2012-04-12 10:51:47 +0100 | [diff] [blame] | 89 | const int kUnmatchedSize = kSizeOfUnmatchedSurrogate; |
| 90 | return Encode(str - kUnmatchedSize, |
| 91 | Utf16::CombineSurrogatePair(previous, c), |
Ben Murdoch | b8a8cc1 | 2014-11-26 15:28:44 +0000 | [diff] [blame] | 92 | Utf16::kNoPreviousCharacter, |
| 93 | replace_invalid) - kUnmatchedSize; |
| 94 | } else if (replace_invalid && |
| 95 | (Utf16::IsLeadSurrogate(c) || |
| 96 | Utf16::IsTrailSurrogate(c))) { |
| 97 | c = kBadChar; |
Ben Murdoch | 3ef787d | 2012-04-12 10:51:47 +0100 | [diff] [blame] | 98 | } |
Steve Block | a7e24c1 | 2009-10-30 11:49:00 +0000 | [diff] [blame] | 99 | str[0] = 0xE0 | (c >> 12); |
| 100 | str[1] = 0x80 | ((c >> 6) & kMask); |
| 101 | str[2] = 0x80 | (c & kMask); |
| 102 | return 3; |
| 103 | } else { |
| 104 | str[0] = 0xF0 | (c >> 18); |
| 105 | str[1] = 0x80 | ((c >> 12) & kMask); |
| 106 | str[2] = 0x80 | ((c >> 6) & kMask); |
| 107 | str[3] = 0x80 | (c & kMask); |
| 108 | return 4; |
| 109 | } |
| 110 | } |
| 111 | |
| 112 | |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 113 | uchar Utf8::ValueOf(const byte* bytes, size_t length, size_t* cursor) { |
Steve Block | a7e24c1 | 2009-10-30 11:49:00 +0000 | [diff] [blame] | 114 | if (length <= 0) return kBadChar; |
| 115 | byte first = bytes[0]; |
| 116 | // Characters between 0000 and 0007F are encoded as a single character |
| 117 | if (first <= kMaxOneByteChar) { |
| 118 | *cursor += 1; |
| 119 | return first; |
| 120 | } |
| 121 | return CalculateValue(bytes, length, cursor); |
| 122 | } |
| 123 | |
Ben Murdoch | 3ef787d | 2012-04-12 10:51:47 +0100 | [diff] [blame] | 124 | unsigned Utf8::Length(uchar c, int previous) { |
Steve Block | a7e24c1 | 2009-10-30 11:49:00 +0000 | [diff] [blame] | 125 | if (c <= kMaxOneByteChar) { |
| 126 | return 1; |
| 127 | } else if (c <= kMaxTwoByteChar) { |
| 128 | return 2; |
| 129 | } else if (c <= kMaxThreeByteChar) { |
Ben Murdoch | 3ef787d | 2012-04-12 10:51:47 +0100 | [diff] [blame] | 130 | if (Utf16::IsTrailSurrogate(c) && |
| 131 | Utf16::IsLeadSurrogate(previous)) { |
| 132 | return kSizeOfUnmatchedSurrogate - kBytesSavedByCombiningSurrogates; |
| 133 | } |
Steve Block | a7e24c1 | 2009-10-30 11:49:00 +0000 | [diff] [blame] | 134 | return 3; |
| 135 | } else { |
| 136 | return 4; |
| 137 | } |
| 138 | } |
| 139 | |
Ben Murdoch | c561043 | 2016-08-08 18:44:38 +0100 | [diff] [blame] | 140 | bool Utf8::IsValidCharacter(uchar c) { |
| 141 | return c < 0xD800u || (c >= 0xE000u && c < 0xFDD0u) || |
| 142 | (c > 0xFDEFu && c <= 0x10FFFFu && (c & 0xFFFEu) != 0xFFFEu && |
| 143 | c != kBadChar); |
| 144 | } |
| 145 | |
Steve Block | a7e24c1 | 2009-10-30 11:49:00 +0000 | [diff] [blame] | 146 | } // namespace unibrow |
| 147 | |
| 148 | #endif // V8_UNICODE_INL_H_ |